meryyllebr543 committed on
Commit 235edb8 · verified · 1 Parent(s): 7da1123

Upload config.yaml

Files changed (1)
  1. config.yaml +47 -0
config.yaml ADDED
@@ -0,0 +1,47 @@
+ # config_fixed.yaml
+ # Configuration for training the Lunaris "Base" ~600M model on 20B tokens
+
+ # --- Model configuration (~600M parameters) ---
+ model:
+   vocab_size: 65536
+   d_model: 1024
+   n_layers: 48
+   n_heads: 16
+   n_kv_heads: 4              # GQA enabled
+   max_seq_len: 4096
+   dropout: 0.0
+   ffn_hidden_multiplier: 4.0
+   multiple_of: 256
+   rope_theta: 10000.0        # RoPE parameter (model default)
+
+ # --- Data configuration ---
+ data_dir: "data/ultrafineweb_20b_sharded/"
+
+ # --- Optimizer configuration ---
+ learning_rate: 3e-4
+ weight_decay: 0.1
+ beta1: 0.9
+ beta2: 0.95
+
+ # --- Learning rate scheduler ---
+ # Global batch: 8 (bs) * 16 (accum) * 4096 (seq len) = 524,288 tokens per step
+ # max_steps = 20B / 524,288 ≈ 38,147
+ max_steps: 38500
+ warmup_steps: 2000
+
+ # --- Training configuration ---
+ batch_size: 8
+ gradient_accumulation_steps: 16
+ grad_clip: 1.0
+ device: "cuda"
+ compile_model: true
+
+ # --- I/O and logging ---
+ out_dir: "checkpoints-20b/lunaris-600m-20b-base"
+ log_interval: 20
+ save_interval: 1000
+
+ # --- Weights & Biases ---
+ wandb_project: "lunaris-codex-pretrain"
+ wandb_entity: null
+ wandb_run_name: "lunaris-600m-20b-base"
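
The scheduler comment in the config can be double-checked with a few lines of arithmetic. The sketch below recomputes the tokens consumed per optimizer step and the number of steps needed for a 20B-token budget directly from the values in this commit. The rough parameter count is an assumption-laden estimate: it assumes a Llama-style decoder with GQA attention, a SwiGLU FFN sized by `ffn_hidden_multiplier` and `multiple_of`, and tied input/output embeddings, none of which is stated in the commit itself.

```python
# Sanity checks for config.yaml above (standalone sketch, not part of the repo).
import math

# --- Token budget per optimizer step (values copied from the config) ---
batch_size = 8
gradient_accumulation_steps = 16
max_seq_len = 4096

tokens_per_step = batch_size * gradient_accumulation_steps * max_seq_len
print(tokens_per_step)                      # 524288
print(math.ceil(20e9 / tokens_per_step))    # 38147 -> max_steps: 38500 leaves headroom

# --- Rough parameter count (ASSUMES Llama-style blocks: GQA attention,
# SwiGLU FFN rounded up to `multiple_of`, tied input/output embeddings) ---
vocab_size, d_model, n_layers = 65536, 1024, 48
n_heads, n_kv_heads = 16, 4
head_dim = d_model // n_heads                       # 64
kv_dim = n_kv_heads * head_dim                      # 256

# SwiGLU hidden size: 2/3 of multiplier * d_model, rounded up to multiple_of
ffn_hidden = int(2 * 4.0 * d_model / 3)
ffn_hidden = 256 * math.ceil(ffn_hidden / 256)      # 2816

attn = 2 * d_model * d_model + 2 * d_model * kv_dim  # q, o + k, v projections
ffn = 3 * d_model * ffn_hidden                       # gate, up, down
embed = vocab_size * d_model                         # shared with the LM head if tied

total = n_layers * (attn + ffn) + embed
print(f"{total / 1e6:.0f}M parameters")              # ~608M, consistent with "~600M"
```

Under these assumptions the headline "~600M" only works out with a tied embedding/LM head; an untied output projection adds another 67M parameters and pushes the same config to roughly 675M.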