Lunaris-0.6B-base / config.yaml
# config_fixed.yaml
# Configuration for training the Lunaris "Base" ~600M with 20B tokens
# --- Model configuration (~600M parameters) ---
model:
  vocab_size: 65536
  d_model: 1024
  n_layers: 48
  n_heads: 16
  n_kv_heads: 4 # GQA enabled
  max_seq_len: 4096
  dropout: 0.0
  ffn_hidden_multiplier: 4.0
  multiple_of: 256
  rope_theta: 10000.0 # RoPE parameter (model default)
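  # Rough parameter-count check (an estimate, assuming a plain 2-matrix FFN
  # with hidden = 4.0 * 1024 = 4096; a SwiGLU FFN rounded up via multiple_of
  # lands in the same ballpark):
  #   embeddings:      65,536 * 1,024                          ≈ 67.1M
  #   attention/layer: Q+O = 2 * 1024^2, K+V = 2 * 1024 * 256  ≈  2.6M (GQA)
  #   FFN/layer:       2 * 1024 * 4096                         ≈  8.4M
  #   total:           48 * ~11.0M + embeddings                ≈ 595M, i.e. "~600M"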
# --- Data configuration ---
data_dir: "data/ultrafineweb_20b_sharded/"
# --- Optimizer configuration ---
learning_rate: 3e-4
weight_decay: 0.1
beta1: 0.9
beta2: 0.95
# --- Learning rate schedule ---
# Global batch: 8 (bs) * 16 (accum) * 4096 (seqlen) = 524,288 tokens
# max_steps = 20B / 524,288 = ~38,147
max_steps: 38500
warmup_steps: 2000
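# Sanity check: 38,500 * 524,288 ≈ 20.2B tokens, slightly over the 20B target;
# warmup covers 2,000 * 524,288 ≈ 1.05B tokens (~5% of the run).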
# --- Training configuration ---
batch_size: 8
gradient_accumulation_steps: 16
grad_clip: 1.0
device: "cuda"
compile_model: true
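# batch_size is the per-device micro-batch; the global batch of
# 8 * 16 = 128 sequences (524,288 tokens) comes from gradient accumulation.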
# --- I/O and logging ---
out_dir: "checkpoints-20b/lunaris-600m-20b-base"
log_interval: 20
save_interval: 1000
# --- Weights & Biases ---
wandb_project: "lunaris-codex-pretrain"
wandb_entity: null
wandb_run_name: "lunaris-600m-20b-base"
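# wandb_entity: null typically falls back to the default entity of the
# logged-in W&B account.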