#!/bin/bash
# Copyright 2025 Bytedance Ltd. and/or its affiliates.
# SPDX-License-Identifier: Apache-2.0
# Change to the project directory
cd /scratch/by2593/Bagel-Zebra-CoT-origin

# Keep the Hugging Face cache in shared memory and make the repo importable
export HF_HOME=/dev/shm/
export PYTHONPATH=/scratch/by2593/Bagel-Zebra-CoT-origin:$PYTHONPATH

# Log Weights & Biases runs offline and anonymously; they can be synced later
export WANDB_MODE=offline
export WANDB_ANONYMOUS=must

# Single-node, 8-GPU torchrun launch settings
NUM_NODES=1
NODE_RANK=0
MASTER_ADDR=localhost
MASTER_PORT=29500
NPROC_PER_NODE=8
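
# Optional sketch, not part of the original run: one way to derive these values on a
# SLURM multi-node job. The SLURM variables and scontrol call are standard, but your
# scheduler setup is an assumption; uncomment and adapt only if launching across nodes.
# if [ -n "${SLURM_JOB_NODELIST:-}" ]; then
#   NUM_NODES=$SLURM_JOB_NUM_NODES
#   NODE_RANK=$SLURM_NODEID
#   MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
# fi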

# Local snapshot of the pretrained multimodal-reasoning-lab/Bagel-Zebra-CoT checkpoint
MODEL_PATH=/scratch/by2593/hf_cache/hub/models--multimodal-reasoning-lab--Bagel-Zebra-CoT/snapshots/ebce32410ee2062d073feae484ea2c6c1515fba8
# Replace the paths above with your own before running
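
# Optional, not part of the original run: the snapshot above can be fetched into the
# same cache with the Hugging Face CLI (repo id and revision are read off the path):
#   HF_HOME=/scratch/by2593/hf_cache huggingface-cli download \
#     multimodal-reasoning-lab/Bagel-Zebra-CoT --revision ebce32410ee2062d073feae484ea2c6c1515fba8
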
torchrun \
  --nnodes=$NUM_NODES \
  --node_rank=$NODE_RANK \
  --nproc_per_node=$NPROC_PER_NODE \
  --master_addr=$MASTER_ADDR \
  --master_port=$MASTER_PORT \
  train/pretrain_unified_navit.py \
  --dataset_config_file ./data/configs/example_smm_semantic.yaml \
  --model_path $MODEL_PATH \
  --layer_module Qwen2MoTDecoderLayer \
  --max_latent_size 64 \
  --resume-from $MODEL_PATH \
  --finetune_from_hf True \
  --auto_resume True \
  --resume-model-only True \
  --finetune-from-ema False \
  --log_every 1 \
  --lr 2e-5 \
  --lr_scheduler cosine \
  --min_lr 1e-6 \
  --num_worker 1 \
  --expected_num_tokens 40000 \
  --max_num_tokens 40000 \
  --max_num_tokens_per_sample 40000 \
  --prefer_buffer_before 10000 \
  --num_shard=$NPROC_PER_NODE \
  --sharding_strategy="HYBRID_SHARD" \
  --wandb_project "zebra-cot" \
  --wandb_name "h200-zebra-cot-$(date +%Y%m%d_%H%M%S)" \
  --save_every 100 \
  --warmup_steps 50 \
  --total_steps 5000 \
  --results_dir results/ \
  --checkpoint_dir results/checkpoints_smm_semantic_part1_v1_origin/ \
  --cpu_offload True \
  > run.out 2> run.err

# Usage: bash scripts/train_smm.sh
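
# Optional post-run sketch (assumes the run.out/run.err redirects above and the
# default local directory wandb uses for offline runs):
#   tail -f run.out run.err            # follow training progress
#   wandb sync wandb/offline-run-*     # upload offline W&B logs once online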