# StarVLA/Qwen-GR00T-Bridge
#
# Model card Files Files and versions
#
# Qwen-GR00T-Bridge / config.yaml
#
# Jinhuiye's picture
#
# Update config.yaml
#
# 58987c0 verified 29 days ago
#
# 3.17 kB

	run_id: 1008_qwenLfm_briage
	run_root_dir: ./results/Checkpoints
	seed: 42
	trackers:
	- jsonl
	- wandb
	wandb_entity: jinhuiye
	wandb_project: InternM1
	is_debug: false
	framework:
	name: QwenGR00T
	qwenvl:
	base_vlm: StarVLA/Qwen2.5-VL-3B-Instruct-Action
	attn_implementation: flash_attention_2
	vl_hidden_dim: 2048
	dino:
	dino_backbone: dinov2_vits14
	action_model:
	action_model_type: DiT-L
	hidden_size: 1024
	add_pos_embed: true
	max_seq_len: 1024
	action_dim: 7
	state_dim: 7
	future_action_window_size: 15
	action_horizon: 16
	past_action_window_size: 0
	repeated_diffusion_steps: 8
	noise_beta_alpha: 1.5
	noise_beta_beta: 1.0
	noise_s: 0.999
	num_timestep_buckets: 1000
	num_inference_timesteps: 4
	num_target_vision_tokens: 32
	diffusion_model_cfg:
	cross_attention_dim: 2048
	dropout: 0.2
	final_dropout: true
	interleave_self_attention: true
	norm_type: ada_norm
	num_layers: 16
	output_dim: 1024
	positional_embeddings: null
	action_hidden_dim: 2048
	datasets:
	vlm_data:
	dataset_py: vlm_datasets
	dataformat: llava_json
	dataset_use: aokvqa_cauldron_llava_format,sharegpt4v_coco,sharegpt4v_knowledge,sharegpt4v_llava,sharegpt4v_sam,asv2_conversation_en,asv2_detailed_description_en,asv2_region_captioning_en,coco_internvl_longcap_en,coco_karpathy_train_567_en,coco_negative_gpt4o_en,coco_poetry_zh,coco_rem_en_zh,cocorem_exist_yorn_en,cocotextv2_en,cocotextv2_gpt4o_en,okvqa_en,refcoco_grounding_aug_en,refcoco_grounding_en,tallyqa_coco_en,toloka_grounding_aug_en,vqav2_en,vsr_en
	eval_dataset: aokvqa_cauldron_llava_format
	data_flatten: false
	base_interval: 2
	max_pixels: 50176
	min_pixels: 784
	model_max_length: 2048
	model_type: qwen2.5vl
	per_device_batch_size: 3
	vla_data:
	dataset_py: lerobot_datasets
	data_root_dir: playground/Datasets/OXE_LEROBOT
	data_mix: bridge
	action_type: delta_ee
	CoT_prompt: Your task is {instruction}. To identify the key objects for your task.
	Locate their bounding boxes in [x1,y1,x2,y2] format.
	CoT_answer: bbox
	default_image_resolution:
	- 3
	- 224
	- 224
	per_device_batch_size: 16
	load_all_data_for_training: true
	obs:
	- image_0
	image_size:
	- 224
	- 224
	trainer:
	epochs: 100
	max_train_steps: 100000
	num_warmup_steps: 10000
	save_interval: 5000
	eval_interval: 1000
	learning_rate:
	base: 3.0e-05
	qwen_vl_interface: 1.0e-05
	action_model: 0.0001
	lr_scheduler_type: cosine_with_min_lr
	scheduler_specific_kwargs:
	min_lr: 5.0e-07
	freeze_modules: true
	loss_scale:
	vla: 1.0
	vlm: 0.1
	repeated_diffusion_steps: 4
	max_grad_norm: 1.0
	warmup_ratio: 0.1
	weight_decay: 0.0
	logging_frequency: 10
	gradient_clipping: 1.0
	gradient_accumulation_steps: 1
	optimizer:
	name: AdamW
	betas:
	- 0.9
	- 0.95
	eps: 1.0e-08
	weight_decay: 1.0e-08
	is_resume: false
	resume_epoch: null
	resume_step: null
	enable_gradient_checkpointing: true
	enable_mixed_precision_training: true
	is_resume: false
	output_dir: ./results/Checkpoints/1008_qwenLfm_briage