#!/usr/bin/env bash
# Copyright 2025 Bytedance Ltd. and/or its affiliates.
# SPDX-License-Identifier: Apache-2.0
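
# Cache Hugging Face downloads in RAM-backed tmpfs for fast access.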
export HF_HOME=/dev/shm/

NUM_NODES=1
NODE_RANK=0
MASTER_ADDR=localhost
MASTER_PORT=29500
NPROC_PER_NODE=8
MODEL_PATH=/dev/shm/models/BAGEL-7B-MoT
# Replace the variables above with your own paths and cluster settings.
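
# Optional preflight checks (a sketch): /dev/shm is RAM-backed tmpfs and is
# emptied on reboot, so fail fast if the checkpoint has not been staged there,
# and warn when fewer GPUs are visible than NPROC_PER_NODE expects.
if [ ! -d "$MODEL_PATH" ]; then
  echo "error: $MODEL_PATH not found; copy BAGEL-7B-MoT into /dev/shm/models first" >&2
  exit 1
fi
GPU_COUNT=$(nvidia-smi -L 2>/dev/null | wc -l)
if [ "$GPU_COUNT" -lt "$NPROC_PER_NODE" ]; then
  echo "warning: only $GPU_COUNT GPUs visible but NPROC_PER_NODE=$NPROC_PER_NODE" >&2
fi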

torchrun \
  --nnodes=$NUM_NODES \
  --node_rank=$NODE_RANK \
  --nproc_per_node=$NPROC_PER_NODE \
  --master_addr=$MASTER_ADDR \
  --master_port=$MASTER_PORT \
  train/pretrain_unified_navit.py \
  --dataset_config_file ./data/configs/example.yaml \
  --model_path $MODEL_PATH \
  --layer_module Qwen2MoTDecoderLayer \
  --max_latent_size 64 \
  --resume_from $MODEL_PATH \
  --finetune_from_hf True \
  --auto_resume True \
  --resume_model_only True \
  --finetune_from_ema True \
  --log_every 1 \
  --lr 2e-5 \
  --lr_scheduler cosine \
  --min_lr 1e-6 \
  --num_worker 1 \
  --expected_num_tokens 60000 \
  --max_num_tokens 60000 \
  --max_num_tokens_per_sample 60000 \
  --prefer_buffer_before 30000 \
  --num_shard=$NPROC_PER_NODE \
  --sharding_strategy="HYBRID_SHARD" \
  --wandb_project "zebra-cot" \
  --wandb_name "h200-zebra-cot-$(date +%Y%m%d_%H%M%S)" \
  --save_every 50 \
  --warmup_steps 50 \
  --total_steps 5000 \
  --results_dir results/ \
  --checkpoint_dir results/checkpoints/ > run.out 2> run.err
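
# Monitor training from another shell (stdout/stderr are redirected above):
#   tail -f run.out run.err
# Multi-node runs (a sketch): launch this same script on every node, setting
# NUM_NODES, NODE_RANK (0..NUM_NODES-1), and MASTER_ADDR to the rank-0 node's
# reachable address so all nodes share the same torchrun rendezvous.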
# Optional: uncomment and move into the argument list above to reduce GPU
# memory pressure at some cost in step speed.
# --cpu_offload True \