VLN-PE/r2r/fine_tuned/rdp/config.json
{
  "architectures": [
    "RDPNet"
  ],
  "model_cfg": {
    "checkpoint_folder": "data/checkpoints/20250619_rdp_train_new/ckpts",
    "eval": {
      "action": "descrete",
      "ckpt_to_load": null,
      "len_traj_act": 2,
      "max_steps": 195,
      "num_sample": 1,
      "pm_threshold": 0.9,
      "rotation_threshold": 0.01,
      "sample": true,
      "save_results": true,
      "split": [
        "val_unseen"
      ],
      "start_eval_epoch": -1,
      "step_interval": 80,
      "stop_mode": "stop_progress",
      "success_distance": 3.0,
      "use_ckpt_config": false
    },
    "il": {
      "batch_size": 4,
      "camera_name": "pano_camera_0",
      "ckpt_to_load": "",
      "dataset_3dgs_root_dir": "data/datasets/3dgs",
      "dataset_grutopia10_root_dir": "data/datasets/grutopia10",
      "dataset_r2r_root_dir": "data/datasets/R2R_VLNCE_v1-3_corrected",
      "epochs": 50,
      "filter_failure": {
        "min_rgb_nums": 15,
        "use": true
      },
      "inflection_weight_coef": null,
      "lmdb_features_dir": "data/sample_episodes/20250211_sample_origin/sample_data.lmdb",
      "load_from_ckpt": false,
      "load_from_pretrain": true,
      "loss": {
        "alpha": 0.0001,
        "dist_scale": 1
      },
      "lr": 0.0001,
      "num_workers": 8,
      "report_to": "wandb",
      "save_filter_frozen_weights": true,
      "save_interval_epochs": 5,
      "save_interval_steps": null,
      "use_descrete_dataset": true,
      "use_iw": null,
      "warmup_ratio": 0.1,
      "weight_decay": 0.0001
    },
    "local_rank": 0,
    "log_dir": "data/checkpoints/20250619_rdp_train_new/logs",
    "model": {
      "ablate_depth": null,
      "ablate_instruction": null,
      "ablate_rgb": null,
      "cross_modal_encoder": {
        "hidden_size": 512,
        "input_type": 3,
        "load_model": false,
        "num_attention_heads": 8,
        "num_x_layers": 2,
        "txt_to_img": true,
        "txt_to_img_layer": 2
      },
      "depth_encoder": null,
      "diffusion_policy": {
        "action_stats": {
          "max": [
            0.25,
            0.25,
            0.27
          ],
          "min": [
            -0.25,
            -0.25,
            -0.27
          ]
        },
        "clip_sample": true,
        "cls_free_guidance_scale": 1.5,
        "cls_mask_method": "mask_token",
        "cls_mask_ratio": 0.25,
        "len_traj_pred": 8,
        "metric_waypoint_spacing": 1,
        "num_diffusion_iters": 20,
        "pred_type": "epsilon",
        "random_mask_instr": true,
        "random_mask_rgb": true,
        "scheduler": "DDPM",
        "stop_weight": 1,
        "transformer_encoding_size": 512,
        "transformer_n_cond_layers": 1,
        "transformer_n_layers": 3,
        "transformer_p_drop_emb": 0.2,
        "txt_len": 80,
        "type": "transformer",
        "use": true,
        "use_cls_free_guidance": true,
        "waypoint_spacing": 1
      },
      "distance_predictor": {
        "normalize": false,
        "use": false
      },
      "eval": {
        "action": "descrete",
        "auto_remove": false,
        "ckpt_to_load": null,
        "distance_threshold": 1.5,
        "episode_count": -1,
        "len_traj_act": 4,
        "load_eval_subset": true,
        "max_len_traj_act": 8,
        "max_steps": 195,
        "min_displacement": 0.15,
        "min_len_traj_act": 3,
        "num_sample": 1,
        "pm_threshold": 0.9,
        "re_eval": false,
        "rotation_threshold": 0.01,
        "sample": null,
        "save_results": true,
        "sim_cfg_file": "vln/configs/sim_cfg_policy_h1_eval.yaml",
        "split": [
          "val_unseen"
        ],
        "start_eval_epoch": -1,
        "step_interval": 80,
        "stop_mode": "stop_progress",
        "stop_progress_threshold": 0.85,
        "stop_x_threshold": 0.015,
        "stop_y_threshold": 0.015,
        "stop_yaw_threshold": 0.05,
        "success_distance": 3.0,
        "train_eval_interval": 100,
        "use_ckpt_config": false,
        "use_dynamic_len_traj_act": false,
        "vln_cfg_file": "vln/configs/vln_cfg_policy_eval.yaml"
      },
      "image_encoder": {
        "depth": {
          "backbone": "resnet50",
          "bottleneck": "resnet",
          "cnn_type": "VlnResnetDepthEncoder",
          "ddppo_checkpoint": "data/ddppo-models/gibson-4plus-mp3d-train-val-test-resnet50.pth",
          "feature_dim": 768,
          "load_model": true,
          "output_size": 128,
          "projection_dim": 512,
          "update_depth_encoder": false
        },
        "dropout": 0.1,
        "env_drop": 0.3,
        "img_stack_nums": 4,
        "rgb": {
          "feature_dim": 768,
          "img_mod": "multi_patches_avg_pooling",
          "load_model": true,
          "model_name": "clip-long",
          "model_path": "data/pretrained/clip-long/longclip-B.pt",
          "multi_patches_num": 5,
          "projection_dim": 512,
          "rgb_proj": false,
          "update_rgb_encoder": false
        },
        "use_env_drop": true,
        "use_stack": false
      },
      "imu_encoder": {
        "encoding_size": 64,
        "input_size": 3,
        "to_local_coords": true,
        "use": true
      },
      "instruction_encoder": null,
      "learn_angle": true,
      "len_traj_act": 4,
      "max_step": 200,
      "normalize_rgb": null,
      "policy_name": "RDP_Policy",
      "prev_action_encoder": {
        "encoding_size": 64,
        "input_size": null,
        "to_local_coords": null,
        "type": "continuous",
        "use": null
      },
      "progress_monitor": {
        "alpha": null,
        "concat_state_txt": true,
        "use": true
      },
      "rgb_encoder": null,
      "seq2seq": null,
      "state_encoder": {
        "dropout_rate": 0.2,
        "hidden_size": 512,
        "num_recurrent_layers": 1,
        "rgb_depth_embed_method": "flat",
        "rnn_type": "GRU",
        "use_dropout": false
      },
      "stop_progress_predictor": {
        "concat_state_txt": true,
        "loss_alpha": 10,
        "type": "continuous",
        "use": true
      },
      "text_encoder": {
        "ablate": null,
        "embedding_size": 512,
        "eot_token": 49407,
        "final_state_only": null,
        "hidden_size": 512,
        "load_model": true,
        "max_length": 248,
        "model_name": "clip-long",
        "model_path": "data/pretrained/clip-long/longclip-B.pt",
        "num_l_layers": 6,
        "pad_token": 0,
        "sot_token": 49406,
        "type": "clip-long",
        "update_text_encoder": false,
        "vocab_size": 50265
      },
      "use_iw": false
    },
    "model_name": "rdp",
    "name": "20250619_rdp_train_new",
    "num_gpus": 4,
    "output_dir": "data/checkpoints/20250619_rdp_train_new/ckpts",
    "seed": 0,
    "tensorboard_dir": "data/checkpoints/20250619_rdp_train_new/tensorboard",
    "torch_gpu_id": 0,
    "torch_gpu_ids": [
      0,
      1,
      2,
      3
    ],
    "world_size": 4
  },
  "model_type": "rdp",
  "torch_dtype": "float32",
  "transformers_version": "4.26.1"
}
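
For reference, a minimal sketch of how the model_cfg.model.diffusion_policy block above could be consumed. This is illustrative, not the repository's actual loading code: the diffusers import assumes the "DDPM" scheduler setting maps onto that library's DDPMScheduler, and normalize_actions is a hypothetical helper showing the common min/max convention for action_stats bounds like these.

import json

import numpy as np
from diffusers import DDPMScheduler  # assumption: the repo pairs "DDPM" with diffusers

# Load the config shown above (path is illustrative).
with open("config.json") as f:
    cfg = json.load(f)

dp = cfg["model_cfg"]["model"]["diffusion_policy"]

# Map the scheduler fields onto a DDPM noise scheduler: 20 denoising
# iterations, epsilon (noise) prediction, and clipped samples.
scheduler = DDPMScheduler(
    num_train_timesteps=dp["num_diffusion_iters"],  # 20
    prediction_type=dp["pred_type"],                # "epsilon"
    clip_sample=dp["clip_sample"],                  # true
)

# Hypothetical helper: min/max-normalize predicted waypoints to [-1, 1],
# the usual convention when a config carries action_stats bounds.
def normalize_actions(actions, stats):
    lo = np.asarray(stats["min"])
    hi = np.asarray(stats["max"])
    return 2.0 * (actions - lo) / (hi - lo) - 1.0

# len_traj_pred steps; 3 dims matching action_stats (x/y/yaw is an assumption).
waypoints = np.zeros((dp["len_traj_pred"], 3))
normed = normalize_actions(waypoints, dp["action_stats"])

Denormalizing a sampled trajectory inverts the same mapping; the remaining fields (encoders, guidance scale, masking ratios) are read from the same nested dictionary.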