VLN-PE/r2r/fine_tuned/rdp/config.json
{
  "architectures": [
    "RDPNet"
  ],
  "model_cfg": {
    "checkpoint_folder": "data/checkpoints/20250619_rdp_train_new/ckpts",
    "eval": {
      "action": "descrete",
      "ckpt_to_load": null,
      "len_traj_act": 2,
      "max_steps": 195,
      "num_sample": 1,
      "pm_threshold": 0.9,
      "rotation_threshold": 0.01,
      "sample": true,
      "save_results": true,
      "split": [
        "val_unseen"
      ],
      "start_eval_epoch": -1,
      "step_interval": 80,
      "stop_mode": "stop_progress",
      "success_distance": 3.0,
      "use_ckpt_config": false
    },
    "il": {
      "batch_size": 4,
      "camera_name": "pano_camera_0",
      "ckpt_to_load": "",
      "dataset_3dgs_root_dir": "data/datasets/3dgs",
      "dataset_grutopia10_root_dir": "data/datasets/grutopia10",
      "dataset_r2r_root_dir": "data/datasets/R2R_VLNCE_v1-3_corrected",
      "epochs": 50,
      "filter_failure": {
        "min_rgb_nums": 15,
        "use": true
      },
      "inflection_weight_coef": null,
      "lmdb_features_dir": "data/sample_episodes/20250211_sample_origin/sample_data.lmdb",
      "load_from_ckpt": false,
      "load_from_pretrain": true,
      "loss": {
        "alpha": 0.0001,
        "dist_scale": 1
      },
      "lr": 0.0001,
      "num_workers": 8,
      "report_to": "wandb",
      "save_filter_frozen_weights": true,
      "save_interval_epochs": 5,
      "save_interval_steps": null,
      "use_descrete_dataset": true,
      "use_iw": null,
      "warmup_ratio": 0.1,
      "weight_decay": 0.0001
    },
    "local_rank": 0,
    "log_dir": "data/checkpoints/20250619_rdp_train_new/logs",
    "model": {
      "ablate_depth": null,
      "ablate_instruction": null,
      "ablate_rgb": null,
      "cross_modal_encoder": {
        "hidden_size": 512,
        "input_type": 3,
        "load_model": false,
        "num_attention_heads": 8,
        "num_x_layers": 2,
        "txt_to_img": true,
        "txt_to_img_layer": 2
      },
      "depth_encoder": null,
      "diffusion_policy": {
        "action_stats": {
          "max": [
            0.25,
            0.25,
            0.27
          ],
          "min": [
            -0.25,
            -0.25,
            -0.27
          ]
        },
        "clip_sample": true,
        "cls_free_guidance_scale": 1.5,
        "cls_mask_method": "mask_token",
        "cls_mask_ratio": 0.25,
        "len_traj_pred": 8,
        "metric_waypoint_spacing": 1,
        "num_diffusion_iters": 20,
        "pred_type": "epsilon",
        "random_mask_instr": true,
        "random_mask_rgb": true,
        "scheduler": "DDPM",
        "stop_weight": 1,
        "transformer_encoding_size": 512,
        "transformer_n_cond_layers": 1,
        "transformer_n_layers": 3,
        "transformer_p_drop_emb": 0.2,
        "txt_len": 80,
        "type": "transformer",
        "use": true,
        "use_cls_free_guidance": true,
        "waypoint_spacing": 1
      },
      "distance_predictor": {
        "normalize": false,
        "use": false
      },
      "eval": {
        "action": "descrete",
        "auto_remove": false,
        "ckpt_to_load": null,
        "distance_threshold": 1.5,
        "episode_count": -1,
        "len_traj_act": 4,
        "load_eval_subset": true,
        "max_len_traj_act": 8,
        "max_steps": 195,
        "min_displacement": 0.15,
        "min_len_traj_act": 3,
        "num_sample": 1,
        "pm_threshold": 0.9,
        "re_eval": false,
        "rotation_threshold": 0.01,
        "sample": null,
        "save_results": true,
        "sim_cfg_file": "vln/configs/sim_cfg_policy_h1_eval.yaml",
        "split": [
          "val_unseen"
        ],
        "start_eval_epoch": -1,
        "step_interval": 80,
        "stop_mode": "stop_progress",
        "stop_progress_threshold": 0.85,
        "stop_x_threshold": 0.015,
        "stop_y_threshold": 0.015,
        "stop_yaw_threshold": 0.05,
        "success_distance": 3.0,
        "train_eval_interval": 100,
        "use_ckpt_config": false,
        "use_dynamic_len_traj_act": false,
        "vln_cfg_file": "vln/configs/vln_cfg_policy_eval.yaml"
      },
      "image_encoder": {
        "depth": {
          "backbone": "resnet50",
          "bottleneck": "resnet",
          "cnn_type": "VlnResnetDepthEncoder",
          "ddppo_checkpoint": "data/ddppo-models/gibson-4plus-mp3d-train-val-test-resnet50.pth",
          "feature_dim": 768,
          "load_model": true,
          "output_size": 128,
          "projection_dim": 512,
          "update_depth_encoder": false
        },
        "dropout": 0.1,
        "env_drop": 0.3,
        "img_stack_nums": 4,
        "rgb": {
          "feature_dim": 768,
          "img_mod": "multi_patches_avg_pooling",
          "load_model": true,
          "model_name": "clip-long",
          "model_path": "data/pretrained/clip-long/longclip-B.pt",
          "multi_patches_num": 5,
          "projection_dim": 512,
          "rgb_proj": false,
          "update_rgb_encoder": false
        },
        "use_env_drop": true,
        "use_stack": false
      },
      "imu_encoder": {
        "encoding_size": 64,
        "input_size": 3,
        "to_local_coords": true,
        "use": true
      },
      "instruction_encoder": null,
      "learn_angle": true,
      "len_traj_act": 4,
      "max_step": 200,
      "normalize_rgb": null,
      "policy_name": "RDP_Policy",
      "prev_action_encoder": {
        "encoding_size": 64,
        "input_size": null,
        "to_local_coords": null,
        "type": "continuous",
        "use": null
      },
      "progress_monitor": {
        "alpha": null,
        "concat_state_txt": true,
        "use": true
      },
      "rgb_encoder": null,
      "seq2seq": null,
      "state_encoder": {
        "dropout_rate": 0.2,
        "hidden_size": 512,
        "num_recurrent_layers": 1,
        "rgb_depth_embed_method": "flat",
        "rnn_type": "GRU",
        "use_dropout": false
      },
      "stop_progress_predictor": {
        "concat_state_txt": true,
        "loss_alpha": 10,
        "type": "continuous",
        "use": true
      },
      "text_encoder": {
        "ablate": null,
        "embedding_size": 512,
        "eot_token": 49407,
        "final_state_only": null,
        "hidden_size": 512,
        "load_model": true,
        "max_length": 248,
        "model_name": "clip-long",
        "model_path": "data/pretrained/clip-long/longclip-B.pt",
        "num_l_layers": 6,
        "pad_token": 0,
        "sot_token": 49406,
        "type": "clip-long",
        "update_text_encoder": false,
        "vocab_size": 50265
      },
      "use_iw": false
    },
    "model_name": "rdp",
    "name": "20250619_rdp_train_new",
    "num_gpus": 4,
    "output_dir": "data/checkpoints/20250619_rdp_train_new/ckpts",
    "seed": 0,
    "tensorboard_dir": "data/checkpoints/20250619_rdp_train_new/tensorboard",
    "torch_gpu_id": 0,
    "torch_gpu_ids": [
      0,
      1,
      2,
      3
    ],
    "world_size": 4
  },
  "model_type": "rdp",
  "torch_dtype": "float32",
  "transformers_version": "4.26.1"
}
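
For reference, a minimal sketch of how the model_cfg.model.diffusion_policy block above could be consumed. This is illustrative, not the repository's actual loading code: the diffusers import assumes the "DDPM" scheduler setting maps onto that library's DDPMScheduler, and normalize_actions is a hypothetical helper showing the common min/max convention for action_stats bounds like these.

import json

import numpy as np
from diffusers import DDPMScheduler  # assumption: the repo pairs "DDPM" with diffusers

# Load the config shown above (path is illustrative).
with open("config.json") as f:
    cfg = json.load(f)

dp = cfg["model_cfg"]["model"]["diffusion_policy"]

# Map the scheduler fields onto a DDPM noise scheduler: 20 denoising
# iterations, epsilon (noise) prediction, and clipped samples.
scheduler = DDPMScheduler(
    num_train_timesteps=dp["num_diffusion_iters"],  # 20
    prediction_type=dp["pred_type"],                # "epsilon"
    clip_sample=dp["clip_sample"],                  # true
)

# Hypothetical helper: min/max-normalize predicted waypoints to [-1, 1],
# the usual convention when a config carries action_stats bounds.
def normalize_actions(actions, stats):
    lo = np.asarray(stats["min"])
    hi = np.asarray(stats["max"])
    return 2.0 * (actions - lo) / (hi - lo) - 1.0

# len_traj_pred steps; 3 dims matching action_stats (x/y/yaw is an assumption).
waypoints = np.zeros((dp["len_traj_pred"], 3))
normed = normalize_actions(waypoints, dp["action_stats"])

Denormalizing a sampled trajectory inverts the same mapping; the remaining fields (encoders, guidance scale, masking ratios) are read from the same nested dictionary.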