---
# Pre-training run configuration: model hyperparameters plus data, optimizer,
# schedule, runtime, checkpoint and experiment-tracking settings.

# NOTE(review): the keys below are nested under `model` because the bare
# `model:` key carried no value of its own and these keys directly follow it.
# Confirm the nesting against the loader's schema.
model:
  vocab_size: 65536
  d_model: 1024
  n_layers: 48
  n_heads: 16
  n_kv_heads: 4
  max_seq_len: 4096
  dropout: 0.0
  ffn_hidden_multiplier: 4.0
  multiple_of: 256
  rope_theta: 10000.0

# Data
data_dir: "data/ultrafineweb_20b_sharded/"

# Optimizer
# Written with an explicit decimal point: YAML 1.1 loaders (e.g. PyYAML)
# resolve a bare `3e-4` as the string "3e-4" rather than a float.
learning_rate: 3.0e-4
weight_decay: 0.1
beta1: 0.9
beta2: 0.95

# Schedule
max_steps: 38500
warmup_steps: 2000

# Batching and runtime
batch_size: 8
gradient_accumulation_steps: 16
grad_clip: 1.0
device: "cuda"
compile_model: true

# Checkpointing and logging
out_dir: "checkpoints-20b/lunaris-600m-20b-base"
log_interval: 20
save_interval: 1000

# Experiment tracking (wandb_* keys)
wandb_project: "lunaris-codex-pretrain"
wandb_entity: null
wandb_run_name: "lunaris-600m-20b-base"