meryyllebr543
/

Lunaris-0.6B-base

Text Generation

Model card Files Files and versions

Lunaris-0.6B-base / config.yaml

meryyllebr543's picture

Upload config.yaml

235edb8 verified 5 months ago

history blame contribute delete

1.14 kB

	# config_fixed.yaml
	# Configuration for training the Lunaris "Base" ~600M model on 20B tokens

	# --- Model configuration (~600M parameters) ---
	# Architecture hyperparameters. They are nested under `model` so they load
	# as one sub-mapping; with the keys flush with `model:`, `model` itself
	# loads as null and these become top-level keys.
	model:
	  vocab_size: 65536
	  d_model: 1024
	  n_layers: 48
	  n_heads: 16
	  n_kv_heads: 4  # GQA enabled: 4 KV heads versus 16 attention heads
	  max_seq_len: 4096
	  dropout: 0.0
	  ffn_hidden_multiplier: 4.0
	  # NOTE(review): presumably the FFN hidden size is rounded to a multiple
	  # of this value - confirm against the model code.
	  multiple_of: 256
	  rope_theta: 10000.0  # RoPE parameter (model default)

	# --- Data configuration ---
	data_dir: 'data/ultrafineweb_20b_sharded/'

	# --- Optimizer configuration ---
	# The learning rate is written with a decimal point (3.0e-4): YAML 1.1
	# loaders such as PyYAML resolve a bare `3e-4` to the string "3e-4"
	# rather than a float.
	learning_rate: 3.0e-4
	weight_decay: 0.1
	beta1: 0.9
	beta2: 0.95

	# --- Learning-rate scheduler ---
	warmup_steps: 2000
	# Global batch: 8 (batch size) * 16 (accumulation) * 4096 (seq len) = 524,288
	# Steps for 20B tokens: 20B / 524,288 = ~38,147; max_steps is set slightly
	# above that figure.
	max_steps: 38500

	# --- Training configuration ---
	device: 'cuda'
	compile_model: true
	grad_clip: 1.0
	# batch_size * gradient_accumulation_steps * max_seq_len gives the
	# 524,288 global batch used in the scheduler arithmetic.
	batch_size: 8
	gradient_accumulation_steps: 16

	# --- I/O and logging ---
	out_dir: 'checkpoints-20b/lunaris-600m-20b-base'
	save_interval: 1000
	log_interval: 20

	# --- Weights & Biases ---
	wandb_project: 'lunaris-codex-pretrain'
	wandb_run_name: 'lunaris-600m-20b-base'
	wandb_entity: null