# config_fixed.yaml
# Configuration for training the Lunaris "Base" ~600M model on 20B tokens

# --- Model configuration (~600M parameters) ---
model:
  vocab_size: 65536
  d_model: 1024
  n_layers: 48
  n_heads: 16
  n_kv_heads: 4            # GQA enabled
  max_seq_len: 4096
  dropout: 0.0
  ffn_hidden_multiplier: 4.0
  multiple_of: 256
  rope_theta: 10000.0      # RoPE parameter (model default)

# --- Data configuration ---
data_dir: "data/ultrafineweb_20b_sharded/"

# --- Optimizer configuration ---
learning_rate: 3e-4
weight_decay: 0.1
beta1: 0.9
beta2: 0.95

# --- Learning rate scheduler ---
# Global batch: 8 (bs) * 16 (accum) * 4096 (seqlen) = 524,288 tokens per optimizer step
# max_steps = 20B / 524,288 = ~38,147
max_steps: 38500
warmup_steps: 2000

# --- Training configuration ---
batch_size: 8
gradient_accumulation_steps: 16
grad_clip: 1.0
device: "cuda"
compile_model: true

# --- I/O and logging ---
out_dir: "checkpoints-20b/lunaris-600m-20b-base"
log_interval: 20
save_interval: 1000

# --- Weights & Biases ---
wandb_project: "lunaris-codex-pretrain"
wandb_entity: null
wandb_run_name: "lunaris-600m-20b-base"
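
# --- Token-budget check (comments only; a sketch of the arithmetic, assuming a
#     single-process run with no data-parallel world-size factor, as in the
#     scheduler comment above) ---
# Why max_steps is rounded up from ~38,147 to 38,500:
#   tokens_per_step = batch_size * gradient_accumulation_steps * max_seq_len
#                   = 8 * 16 * 4096 = 524,288
#   total_tokens    = max_steps * tokens_per_step
#                   = 38,500 * 524,288 ≈ 20.19B tokens, slightly above the 20B target.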