GaborMadarasz committed on
Commit 04e5209 · verified · 1 parent: dc0a6ae

Upload HuMBERT model


The first Hungarian ModernBERT

config.json ADDED
@@ -0,0 +1,47 @@
+ {
+   "_name_or_path": "answerdotai/ModernBERT-base",
+   "architectures": [
+     "ModernBertForMaskedLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": null,
+   "classifier_activation": "gelu",
+   "classifier_bias": false,
+   "classifier_dropout": 0.0,
+   "classifier_pooling": "mean",
+   "cls_token_id": 50281,
+   "decoder_bias": true,
+   "deterministic_flash_attn": false,
+   "embedding_dropout": 0.0,
+   "eos_token_id": null,
+   "global_attn_every_n_layers": 3,
+   "global_rope_theta": 160000.0,
+   "gradient_checkpointing": false,
+   "hidden_activation": "gelu",
+   "hidden_size": 768,
+   "initializer_cutoff_factor": 2.0,
+   "initializer_range": 0.02,
+   "intermediate_size": 1152,
+   "layer_norm_eps": 1e-05,
+   "local_attention": 128,
+   "local_rope_theta": 10000.0,
+   "max_position_embeddings": 8192,
+   "mlp_bias": false,
+   "mlp_dropout": 0.0,
+   "model_type": "modernbert",
+   "norm_bias": false,
+   "norm_eps": 1e-05,
+   "num_attention_heads": 12,
+   "num_hidden_layers": 22,
+   "pad_token_id": 50283,
+   "position_embedding_type": "absolute",
+   "reference_compile": true,
+   "repad_logits_with_grad": false,
+   "sep_token_id": 50282,
+   "sparse_pred_ignore_index": -100,
+   "sparse_prediction": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.48.3",
+   "vocab_size": 52000
+ }
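
The config above is a standard ModernBERT masked-LM configuration initialized from answerdotai/ModernBERT-base: 22 layers, 12 attention heads, hidden size 768, an 8192-token context window, and a 52,000-token Hungarian vocabulary. As a minimal usage sketch (not part of this commit), the snippet below loads the checkpoint for fill-mask inference; the repository id is a placeholder and should be replaced with this repo's actual path. ModernBERT needs transformers >= 4.48, consistent with the "transformers_version" recorded above.

```python
# Minimal sketch, not part of this commit: load the uploaded checkpoint for
# masked-LM inference. "GaborMadarasz/HuMBERT" is a placeholder repo id --
# point it at this repository or at a local clone of these files.
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

repo = "GaborMadarasz/HuMBERT"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModelForMaskedLM.from_pretrained(repo)
model.eval()

# Fill-mask example using the [MASK] token defined in special_tokens_map.json.
text = f"Magyarország fővárosa {tokenizer.mask_token}."
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

mask_pos = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
top5 = logits[0, mask_pos[0]].topk(5).indices.tolist()
print(tokenizer.convert_ids_to_tokens(top5))
```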
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:12223fde4b5b7faa4707932cb33e904fdb96a8bdec71c6ee17364281b8ec9eb4
+ size 603655064
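
model.safetensors is tracked with Git LFS, so the diff shows only the three-line pointer above; the actual weight file is about 604 MB, which at float32 corresponds to roughly 151 million parameters. A minimal sketch (assuming the file has been downloaded locally) for inspecting the tensors with the safetensors library:

```python
# Minimal sketch: inspect the LFS-tracked weights after downloading them
# (the three lines above are only the Git LFS pointer, not the weights).
from safetensors import safe_open

with safe_open("model.safetensors", framework="pt", device="cpu") as f:
    n_params = 0
    for name in f.keys():
        shape = f.get_slice(name).get_shape()
        count = 1
        for dim in shape:
            count *= dim
        n_params += count
    print(f"{len(f.keys())} tensors, {n_params / 1e6:.1f}M parameters")
```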
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2256c6848c20bc86005d7d5f232e62b8387cd260dd85250b5c4b9eacb36d91b0
+ size 14244
scheduler.pt ADDED
Binary file (1.06 kB).
 
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "cls_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,73 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<|padding|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "5": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "6": {
+       "content": "[MASK]",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "extra_special_tokens": {},
+   "mask_token": "[MASK]",
+   "model_input_names": [
+     "input_ids",
+     "attention_mask"
+   ],
+   "model_max_length": 8192,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "tokenizer_class": "PreTrainedTokenizerFast",
+   "unk_token": "[UNK]"
+ }
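
The tokenizer is a PreTrainedTokenizerFast whose special tokens sit at low ids ([UNK] = 2, [CLS] = 3, [SEP] = 4, [PAD] = 5, [MASK] = 6), while config.json above still records the ModernBERT-base ids (cls_token_id 50281, sep_token_id 50282, pad_token_id 50283). A quick consistency check along the lines below may be worth running before fine-tuning; the repo id is again a placeholder:

```python
# Minimal sketch: compare the tokenizer's special-token ids with the ids that
# config.json records. "GaborMadarasz/HuMBERT" is a placeholder repo id.
from transformers import AutoConfig, AutoTokenizer

repo = "GaborMadarasz/HuMBERT"  # placeholder
cfg = AutoConfig.from_pretrained(repo)
tok = AutoTokenizer.from_pretrained(repo)

for name in ("cls_token_id", "sep_token_id", "pad_token_id"):
    config_id = getattr(cfg, name)
    tokenizer_id = getattr(tok, name)
    status = "OK" if config_id == tokenizer_id else "MISMATCH"
    print(f"{name}: config={config_id} tokenizer={tokenizer_id} {status}")
```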
trainer_state.json ADDED
@@ -0,0 +1,1437 @@
1
+ {
2
+ "best_metric": 2.840968608856201,
3
+ "best_model_checkpoint": "/home/jovyan/work/jupytershared/madaraszg/hun_modernBERT-base/checkpoint-900000",
4
+ "epoch": 0.7064444611025793,
5
+ "eval_steps": 50000,
6
+ "global_step": 900000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0039246914505698845,
13
+ "grad_norm": 5.427852153778076,
14
+ "learning_rate": 0.0004998,
15
+ "loss": 12.7757,
16
+ "step": 5000
17
+ },
18
+ {
19
+ "epoch": 0.007849382901139769,
20
+ "grad_norm": 3.9721639156341553,
21
+ "learning_rate": 0.0004999952362977937,
22
+ "loss": 9.605,
23
+ "step": 10000
24
+ },
25
+ {
26
+ "epoch": 0.011774074351709655,
27
+ "grad_norm": 4.18765115737915,
28
+ "learning_rate": 0.0004999809491856918,
29
+ "loss": 8.2774,
30
+ "step": 15000
31
+ },
32
+ {
33
+ "epoch": 0.015698765802279538,
34
+ "grad_norm": 3.2071168422698975,
35
+ "learning_rate": 0.0004999571563624537,
36
+ "loss": 7.7417,
37
+ "step": 20000
38
+ },
39
+ {
40
+ "epoch": 0.019623457252849426,
41
+ "grad_norm": 3.8722894191741943,
42
+ "learning_rate": 0.0004999237996462354,
43
+ "loss": 7.5097,
44
+ "step": 25000
45
+ },
46
+ {
47
+ "epoch": 0.02354814870341931,
48
+ "grad_norm": 3.7934718132019043,
49
+ "learning_rate": 0.0004998809069899119,
50
+ "loss": 7.2104,
51
+ "step": 30000
52
+ },
53
+ {
54
+ "epoch": 0.027472840153989195,
55
+ "grad_norm": 3.849618673324585,
56
+ "learning_rate": 0.0004998285486553762,
57
+ "loss": 7.3994,
58
+ "step": 35000
59
+ },
60
+ {
61
+ "epoch": 0.031397531604559076,
62
+ "grad_norm": 4.657052516937256,
63
+ "learning_rate": 0.0004997666008298054,
64
+ "loss": 6.9989,
65
+ "step": 40000
66
+ },
67
+ {
68
+ "epoch": 0.035322223055128964,
69
+ "grad_norm": 4.1764092445373535,
70
+ "learning_rate": 0.0004996951230621116,
71
+ "loss": 6.8514,
72
+ "step": 45000
73
+ },
74
+ {
75
+ "epoch": 0.03924691450569885,
76
+ "grad_norm": 3.2545883655548096,
77
+ "learning_rate": 0.000499614118079557,
78
+ "loss": 6.7546,
79
+ "step": 50000
80
+ },
81
+ {
82
+ "epoch": 0.03924691450569885,
83
+ "eval_loss": 3.438565969467163,
84
+ "eval_runtime": 920.3232,
85
+ "eval_samples_per_second": 234.641,
86
+ "eval_steps_per_second": 29.331,
87
+ "step": 50000
88
+ },
89
+ {
90
+ "epoch": 0.04317160595626873,
91
+ "grad_norm": 2.773167610168457,
92
+ "learning_rate": 0.0004995236461452331,
93
+ "loss": 6.6936,
94
+ "step": 55000
95
+ },
96
+ {
97
+ "epoch": 0.04709629740683862,
98
+ "grad_norm": 3.7268900871276855,
99
+ "learning_rate": 0.0004994236230404189,
100
+ "loss": 6.6314,
101
+ "step": 60000
102
+ },
103
+ {
104
+ "epoch": 0.0510209888574085,
105
+ "grad_norm": 4.384599685668945,
106
+ "learning_rate": 0.0004993141554730779,
107
+ "loss": 6.5624,
108
+ "step": 65000
109
+ },
110
+ {
111
+ "epoch": 0.05494568030797839,
112
+ "grad_norm": 3.721292018890381,
113
+ "learning_rate": 0.0004991950913888839,
114
+ "loss": 6.5376,
115
+ "step": 70000
116
+ },
117
+ {
118
+ "epoch": 0.05887037175854827,
119
+ "grad_norm": 4.107857704162598,
120
+ "learning_rate": 0.0004990665724978456,
121
+ "loss": 6.4852,
122
+ "step": 75000
123
+ },
124
+ {
125
+ "epoch": 0.06279506320911815,
126
+ "grad_norm": 4.358203887939453,
127
+ "learning_rate": 0.0004989285579755796,
128
+ "loss": 6.4618,
129
+ "step": 80000
130
+ },
131
+ {
132
+ "epoch": 0.06671975465968805,
133
+ "grad_norm": 2.996066093444824,
134
+ "learning_rate": 0.0004987810835455134,
135
+ "loss": 6.4299,
136
+ "step": 85000
137
+ },
138
+ {
139
+ "epoch": 0.07064444611025793,
140
+ "grad_norm": 3.4833667278289795,
141
+ "learning_rate": 0.0004986240634463282,
142
+ "loss": 6.3824,
143
+ "step": 90000
144
+ },
145
+ {
146
+ "epoch": 0.07456913756082781,
147
+ "grad_norm": 4.706465721130371,
148
+ "learning_rate": 0.0004984575950481576,
149
+ "loss": 6.3747,
150
+ "step": 95000
151
+ },
152
+ {
153
+ "epoch": 0.0784938290113977,
154
+ "grad_norm": 5.1914215087890625,
155
+ "learning_rate": 0.0004982817626825089,
156
+ "loss": 6.3426,
157
+ "step": 100000
158
+ },
159
+ {
160
+ "epoch": 0.0784938290113977,
161
+ "eval_loss": 3.232980966567993,
162
+ "eval_runtime": 1042.7899,
163
+ "eval_samples_per_second": 207.085,
164
+ "eval_steps_per_second": 25.886,
165
+ "step": 100000
166
+ },
167
+ {
168
+ "epoch": 0.08241852046196758,
169
+ "grad_norm": 3.467543363571167,
170
+ "learning_rate": 0.0004980962857598535,
171
+ "loss": 6.323,
172
+ "step": 105000
173
+ },
174
+ {
175
+ "epoch": 0.08634321191253747,
176
+ "grad_norm": 4.3943400382995605,
177
+ "learning_rate": 0.000497901382561167,
178
+ "loss": 6.2856,
179
+ "step": 110000
180
+ },
181
+ {
182
+ "epoch": 0.09026790336310735,
183
+ "grad_norm": 5.44612979888916,
184
+ "learning_rate": 0.0004976970661964071,
185
+ "loss": 6.2661,
186
+ "step": 115000
187
+ },
188
+ {
189
+ "epoch": 0.09419259481367724,
190
+ "grad_norm": 4.370021820068359,
191
+ "learning_rate": 0.0004974833064126723,
192
+ "loss": 6.2631,
193
+ "step": 120000
194
+ },
195
+ {
196
+ "epoch": 0.09811728626424712,
197
+ "grad_norm": 5.689448833465576,
198
+ "learning_rate": 0.0004972601113595269,
199
+ "loss": 6.2489,
200
+ "step": 125000
201
+ },
202
+ {
203
+ "epoch": 0.102041977714817,
204
+ "grad_norm": 4.484179496765137,
205
+ "learning_rate": 0.0004970275370315128,
206
+ "loss": 6.2418,
207
+ "step": 130000
208
+ },
209
+ {
210
+ "epoch": 0.10596666916538688,
211
+ "grad_norm": 3.9610681533813477,
212
+ "learning_rate": 0.0004967854498415297,
213
+ "loss": 6.2117,
214
+ "step": 135000
215
+ },
216
+ {
217
+ "epoch": 0.10989136061595678,
218
+ "grad_norm": 3.5981478691101074,
219
+ "learning_rate": 0.0004965340014730877,
220
+ "loss": 6.1937,
221
+ "step": 140000
222
+ },
223
+ {
224
+ "epoch": 0.11381605206652666,
225
+ "grad_norm": 4.028958320617676,
226
+ "learning_rate": 0.0004962731540273645,
227
+ "loss": 6.185,
228
+ "step": 145000
229
+ },
230
+ {
231
+ "epoch": 0.11774074351709654,
232
+ "grad_norm": 3.569103479385376,
233
+ "learning_rate": 0.0004960029174491356,
234
+ "loss": 6.1823,
235
+ "step": 150000
236
+ },
237
+ {
238
+ "epoch": 0.11774074351709654,
239
+ "eval_loss": 3.1528232097625732,
240
+ "eval_runtime": 1097.5085,
241
+ "eval_samples_per_second": 196.76,
242
+ "eval_steps_per_second": 24.596,
243
+ "step": 150000
244
+ },
245
+ {
246
+ "epoch": 0.12166543496766644,
247
+ "grad_norm": 3.469241142272949,
248
+ "learning_rate": 0.0004957233020411361,
249
+ "loss": 6.167,
250
+ "step": 155000
251
+ },
252
+ {
253
+ "epoch": 0.1255901264182363,
254
+ "grad_norm": 5.208609580993652,
255
+ "learning_rate": 0.0004954343772197868,
256
+ "loss": 6.1673,
257
+ "step": 160000
258
+ },
259
+ {
260
+ "epoch": 0.1295148178688062,
261
+ "grad_norm": 4.649272918701172,
262
+ "learning_rate": 0.0004951359777341922,
263
+ "loss": 6.1443,
264
+ "step": 165000
265
+ },
266
+ {
267
+ "epoch": 0.1334395093193761,
268
+ "grad_norm": 4.373592376708984,
269
+ "learning_rate": 0.0004948284162183676,
270
+ "loss": 6.1392,
271
+ "step": 170000
272
+ },
273
+ {
274
+ "epoch": 0.13736420076994596,
275
+ "grad_norm": 4.671726703643799,
276
+ "learning_rate": 0.0004945113350346094,
277
+ "loss": 6.1305,
278
+ "step": 175000
279
+ },
280
+ {
281
+ "epoch": 0.14128889222051585,
282
+ "grad_norm": 3.430539846420288,
283
+ "learning_rate": 0.0004941849281568307,
284
+ "loss": 6.1348,
285
+ "step": 180000
286
+ },
287
+ {
288
+ "epoch": 0.14521358367108575,
289
+ "grad_norm": 4.936987400054932,
290
+ "learning_rate": 0.0004938494122969139,
291
+ "loss": 6.1163,
292
+ "step": 185000
293
+ },
294
+ {
295
+ "epoch": 0.14913827512165562,
296
+ "grad_norm": 7.4127984046936035,
297
+ "learning_rate": 0.0004935043273650284,
298
+ "loss": 6.1055,
299
+ "step": 190000
300
+ },
301
+ {
302
+ "epoch": 0.1530629665722255,
303
+ "grad_norm": 4.886425495147705,
304
+ "learning_rate": 0.0004931501668390127,
305
+ "loss": 6.1162,
306
+ "step": 195000
307
+ },
308
+ {
309
+ "epoch": 0.1569876580227954,
310
+ "grad_norm": 5.542958736419678,
311
+ "learning_rate": 0.000492786518985573,
312
+ "loss": 6.0902,
313
+ "step": 200000
314
+ },
315
+ {
316
+ "epoch": 0.1569876580227954,
317
+ "eval_loss": 3.106311321258545,
318
+ "eval_runtime": 1143.7295,
319
+ "eval_samples_per_second": 188.809,
320
+ "eval_steps_per_second": 23.602,
321
+ "step": 200000
322
+ },
323
+ {
324
+ "epoch": 0.16091234947336527,
325
+ "grad_norm": 5.0378923416137695,
326
+ "learning_rate": 0.0004924137585324416,
327
+ "loss": 6.0822,
328
+ "step": 205000
329
+ },
330
+ {
331
+ "epoch": 0.16483704092393517,
332
+ "grad_norm": 3.9391469955444336,
333
+ "learning_rate": 0.0004920318334402453,
334
+ "loss": 6.0806,
335
+ "step": 210000
336
+ },
337
+ {
338
+ "epoch": 0.16876173237450504,
339
+ "grad_norm": 4.512287139892578,
340
+ "learning_rate": 0.0004916405262093358,
341
+ "loss": 6.0895,
342
+ "step": 215000
343
+ },
344
+ {
345
+ "epoch": 0.17268642382507493,
346
+ "grad_norm": 4.639518737792969,
347
+ "learning_rate": 0.0004912400838187446,
348
+ "loss": 6.0726,
349
+ "step": 220000
350
+ },
351
+ {
352
+ "epoch": 0.17661111527564483,
353
+ "grad_norm": 4.6264190673828125,
354
+ "learning_rate": 0.0004908306928184586,
355
+ "loss": 6.075,
356
+ "step": 225000
357
+ },
358
+ {
359
+ "epoch": 0.1805358067262147,
360
+ "grad_norm": 4.821324348449707,
361
+ "learning_rate": 0.000490411707628039,
362
+ "loss": 6.0763,
363
+ "step": 230000
364
+ },
365
+ {
366
+ "epoch": 0.1844604981767846,
367
+ "grad_norm": 4.997890949249268,
368
+ "learning_rate": 0.0004899836359827696,
369
+ "loss": 6.0439,
370
+ "step": 235000
371
+ },
372
+ {
373
+ "epoch": 0.18838518962735448,
374
+ "grad_norm": 5.442014694213867,
375
+ "learning_rate": 0.0004895464997071264,
376
+ "loss": 6.048,
377
+ "step": 240000
378
+ },
379
+ {
380
+ "epoch": 0.19230988107792435,
381
+ "grad_norm": 4.34916353225708,
382
+ "learning_rate": 0.0004891002307518132,
383
+ "loss": 6.059,
384
+ "step": 245000
385
+ },
386
+ {
387
+ "epoch": 0.19623457252849424,
388
+ "grad_norm": 4.754409313201904,
389
+ "learning_rate": 0.0004886448461307771,
390
+ "loss": 6.0483,
391
+ "step": 250000
392
+ },
393
+ {
394
+ "epoch": 0.19623457252849424,
395
+ "eval_loss": 3.0871126651763916,
396
+ "eval_runtime": 1102.0863,
397
+ "eval_samples_per_second": 195.943,
398
+ "eval_steps_per_second": 24.494,
399
+ "step": 250000
400
+ },
401
+ {
402
+ "epoch": 0.20015926397906414,
403
+ "grad_norm": 7.54548978805542,
404
+ "learning_rate": 0.0004881803632054984,
405
+ "loss": 6.0235,
406
+ "step": 255000
407
+ },
408
+ {
409
+ "epoch": 0.204083955429634,
410
+ "grad_norm": 6.219805717468262,
411
+ "learning_rate": 0.00048770689534197696,
412
+ "loss": 6.0259,
413
+ "step": 260000
414
+ },
415
+ {
416
+ "epoch": 0.2080086468802039,
417
+ "grad_norm": 4.785896301269531,
418
+ "learning_rate": 0.00048722417362181855,
419
+ "loss": 6.0152,
420
+ "step": 265000
421
+ },
422
+ {
423
+ "epoch": 0.21193333833077377,
424
+ "grad_norm": 6.293859958648682,
425
+ "learning_rate": 0.000486732602694764,
426
+ "loss": 6.014,
427
+ "step": 270000
428
+ },
429
+ {
430
+ "epoch": 0.21585802978134366,
431
+ "grad_norm": 5.359744548797607,
432
+ "learning_rate": 0.0004862318078178062,
433
+ "loss": 6.0129,
434
+ "step": 275000
435
+ },
436
+ {
437
+ "epoch": 0.21978272123191356,
438
+ "grad_norm": 4.500082015991211,
439
+ "learning_rate": 0.00048572210591012405,
440
+ "loss": 6.0007,
441
+ "step": 280000
442
+ },
443
+ {
444
+ "epoch": 0.22370741268248343,
445
+ "grad_norm": 5.4786906242370605,
446
+ "learning_rate": 0.00048520341712729654,
447
+ "loss": 5.9918,
448
+ "step": 285000
449
+ },
450
+ {
451
+ "epoch": 0.22763210413305332,
452
+ "grad_norm": 5.049560546875,
453
+ "learning_rate": 0.0004846758677132269,
454
+ "loss": 6.0041,
455
+ "step": 290000
456
+ },
457
+ {
458
+ "epoch": 0.23155679558362322,
459
+ "grad_norm": 4.548033714294434,
460
+ "learning_rate": 0.00048413915837784905,
461
+ "loss": 6.009,
462
+ "step": 295000
463
+ },
464
+ {
465
+ "epoch": 0.23548148703419308,
466
+ "grad_norm": 5.688722133636475,
467
+ "learning_rate": 0.00048359362898595776,
468
+ "loss": 5.9754,
469
+ "step": 300000
470
+ },
471
+ {
472
+ "epoch": 0.23548148703419308,
473
+ "eval_loss": 3.070676803588867,
474
+ "eval_runtime": 1156.3208,
475
+ "eval_samples_per_second": 186.753,
476
+ "eval_steps_per_second": 23.345,
477
+ "step": 300000
478
+ },
479
+ {
480
+ "epoch": 0.23940617848476298,
481
+ "grad_norm": 3.3441436290740967,
482
+ "learning_rate": 0.0004830391938668317,
483
+ "loss": 5.9819,
484
+ "step": 305000
485
+ },
486
+ {
487
+ "epoch": 0.24333086993533287,
488
+ "grad_norm": 4.076082229614258,
489
+ "learning_rate": 0.0004824759877544745,
490
+ "loss": 5.9817,
491
+ "step": 310000
492
+ },
493
+ {
494
+ "epoch": 0.24725556138590274,
495
+ "grad_norm": 4.681301593780518,
496
+ "learning_rate": 0.00048190380670406807,
497
+ "loss": 5.9629,
498
+ "step": 315000
499
+ },
500
+ {
501
+ "epoch": 0.2511802528364726,
502
+ "grad_norm": 5.435401439666748,
503
+ "learning_rate": 0.0004813229014844921,
504
+ "loss": 5.9558,
505
+ "step": 320000
506
+ },
507
+ {
508
+ "epoch": 0.2551049442870425,
509
+ "grad_norm": 5.168973922729492,
510
+ "learning_rate": 0.0004807330617418338,
511
+ "loss": 5.9585,
512
+ "step": 325000
513
+ },
514
+ {
515
+ "epoch": 0.2590296357376124,
516
+ "grad_norm": 3.879523754119873,
517
+ "learning_rate": 0.0004801344253307013,
518
+ "loss": 5.9367,
519
+ "step": 330000
520
+ },
521
+ {
522
+ "epoch": 0.2629543271881823,
523
+ "grad_norm": 3.8004238605499268,
524
+ "learning_rate": 0.00047952689266699537,
525
+ "loss": 5.9508,
526
+ "step": 335000
527
+ },
528
+ {
529
+ "epoch": 0.2668790186387522,
530
+ "grad_norm": 3.6730406284332275,
531
+ "learning_rate": 0.0004789107299737269,
532
+ "loss": 5.945,
533
+ "step": 340000
534
+ },
535
+ {
536
+ "epoch": 0.270803710089322,
537
+ "grad_norm": 5.348697662353516,
538
+ "learning_rate": 0.00047828621778500213,
539
+ "loss": 5.9411,
540
+ "step": 345000
541
+ },
542
+ {
543
+ "epoch": 0.2747284015398919,
544
+ "grad_norm": 6.138799667358398,
545
+ "learning_rate": 0.00047765250211029227,
546
+ "loss": 5.9372,
547
+ "step": 350000
548
+ },
549
+ {
550
+ "epoch": 0.2747284015398919,
551
+ "eval_loss": 3.0503780841827393,
552
+ "eval_runtime": 1159.4067,
553
+ "eval_samples_per_second": 186.256,
554
+ "eval_steps_per_second": 23.283,
555
+ "step": 350000
556
+ },
557
+ {
558
+ "epoch": 0.2786530929904618,
559
+ "grad_norm": 5.234776020050049,
560
+ "learning_rate": 0.0004770101037396353,
561
+ "loss": 5.9323,
562
+ "step": 355000
563
+ },
564
+ {
565
+ "epoch": 0.2825777844410317,
566
+ "grad_norm": 5.453892707824707,
567
+ "learning_rate": 0.00047635904717418853,
568
+ "loss": 5.9306,
569
+ "step": 360000
570
+ },
571
+ {
572
+ "epoch": 0.2865024758916016,
573
+ "grad_norm": 5.497768402099609,
574
+ "learning_rate": 0.00047569935724533363,
575
+ "loss": 5.9311,
576
+ "step": 365000
577
+ },
578
+ {
579
+ "epoch": 0.2904271673421715,
580
+ "grad_norm": 5.382653713226318,
581
+ "learning_rate": 0.0004750313282043671,
582
+ "loss": 5.9188,
583
+ "step": 370000
584
+ },
585
+ {
586
+ "epoch": 0.29435185879274134,
587
+ "grad_norm": 3.7600419521331787,
588
+ "learning_rate": 0.0004743548595639592,
589
+ "loss": 5.9109,
590
+ "step": 375000
591
+ },
592
+ {
593
+ "epoch": 0.29827655024331123,
594
+ "grad_norm": 6.006251811981201,
595
+ "learning_rate": 0.00047366929239918046,
596
+ "loss": 5.9207,
597
+ "step": 380000
598
+ },
599
+ {
600
+ "epoch": 0.30220124169388113,
601
+ "grad_norm": 6.547440052032471,
602
+ "learning_rate": 0.0004729753307170254,
603
+ "loss": 5.9264,
604
+ "step": 385000
605
+ },
606
+ {
607
+ "epoch": 0.306125933144451,
608
+ "grad_norm": 5.451165199279785,
609
+ "learning_rate": 0.0004722730060967992,
610
+ "loss": 5.9077,
611
+ "step": 390000
612
+ },
613
+ {
614
+ "epoch": 0.3100506245950209,
615
+ "grad_norm": 6.919593811035156,
616
+ "learning_rate": 0.0004715620642838824,
617
+ "loss": 5.9068,
618
+ "step": 395000
619
+ },
620
+ {
621
+ "epoch": 0.3139753160455908,
622
+ "grad_norm": 5.905742645263672,
623
+ "learning_rate": 0.00047084281681554897,
624
+ "loss": 5.9082,
625
+ "step": 400000
626
+ },
627
+ {
628
+ "epoch": 0.3139753160455908,
629
+ "eval_loss": 3.0186421871185303,
630
+ "eval_runtime": 1163.0768,
631
+ "eval_samples_per_second": 185.668,
632
+ "eval_steps_per_second": 23.209,
633
+ "step": 400000
634
+ },
635
+ {
636
+ "epoch": 0.31790000749616065,
637
+ "grad_norm": 7.0828986167907715,
638
+ "learning_rate": 0.00047011514974306426,
639
+ "loss": 5.9015,
640
+ "step": 405000
641
+ },
642
+ {
643
+ "epoch": 0.32182469894673055,
644
+ "grad_norm": 5.617277145385742,
645
+ "learning_rate": 0.0004693792389168259,
646
+ "loss": 5.8899,
647
+ "step": 410000
648
+ },
649
+ {
650
+ "epoch": 0.32574939039730044,
651
+ "grad_norm": 5.834865093231201,
652
+ "learning_rate": 0.0004686349676316719,
653
+ "loss": 5.9025,
654
+ "step": 415000
655
+ },
656
+ {
657
+ "epoch": 0.32967408184787034,
658
+ "grad_norm": 4.863770961761475,
659
+ "learning_rate": 0.00046788190992146675,
660
+ "loss": 5.8755,
661
+ "step": 420000
662
+ },
663
+ {
664
+ "epoch": 0.33359877329844023,
665
+ "grad_norm": 4.0446391105651855,
666
+ "learning_rate": 0.0004671209981513359,
667
+ "loss": 5.8738,
668
+ "step": 425000
669
+ },
670
+ {
671
+ "epoch": 0.33752346474901007,
672
+ "grad_norm": 4.7902045249938965,
673
+ "learning_rate": 0.0004663516572272386,
674
+ "loss": 5.8761,
675
+ "step": 430000
676
+ },
677
+ {
678
+ "epoch": 0.34144815619957997,
679
+ "grad_norm": 3.621206283569336,
680
+ "learning_rate": 0.00046557406792387514,
681
+ "loss": 5.877,
682
+ "step": 435000
683
+ },
684
+ {
685
+ "epoch": 0.34537284765014986,
686
+ "grad_norm": 4.824281692504883,
687
+ "learning_rate": 0.0004647881018418675,
688
+ "loss": 5.8776,
689
+ "step": 440000
690
+ },
691
+ {
692
+ "epoch": 0.34929753910071976,
693
+ "grad_norm": 5.029583930969238,
694
+ "learning_rate": 0.0004639942630746584,
695
+ "loss": 5.8648,
696
+ "step": 445000
697
+ },
698
+ {
699
+ "epoch": 0.35322223055128965,
700
+ "grad_norm": 4.80470609664917,
701
+ "learning_rate": 0.0004631924303768197,
702
+ "loss": 5.8674,
703
+ "step": 450000
704
+ },
705
+ {
706
+ "epoch": 0.35322223055128965,
707
+ "eval_loss": 3.011624336242676,
708
+ "eval_runtime": 1103.4516,
709
+ "eval_samples_per_second": 195.7,
710
+ "eval_steps_per_second": 24.463,
711
+ "step": 450000
712
+ },
713
+ {
714
+ "epoch": 0.35714692200185955,
715
+ "grad_norm": 4.197587490081787,
716
+ "learning_rate": 0.0004623819874532173,
717
+ "loss": 5.8754,
718
+ "step": 455000
719
+ },
720
+ {
721
+ "epoch": 0.3610716134524294,
722
+ "grad_norm": 4.719716548919678,
723
+ "learning_rate": 0.0004615632797034139,
724
+ "loss": 5.8609,
725
+ "step": 460000
726
+ },
727
+ {
728
+ "epoch": 0.3649963049029993,
729
+ "grad_norm": 5.633116245269775,
730
+ "learning_rate": 0.00046073683199574194,
731
+ "loss": 5.8661,
732
+ "step": 465000
733
+ },
734
+ {
735
+ "epoch": 0.3689209963535692,
736
+ "grad_norm": 4.527806758880615,
737
+ "learning_rate": 0.00045990218220831147,
738
+ "loss": 5.8481,
739
+ "step": 470000
740
+ },
741
+ {
742
+ "epoch": 0.37284568780413907,
743
+ "grad_norm": 7.49449348449707,
744
+ "learning_rate": 0.0004590596960876785,
745
+ "loss": 5.8393,
746
+ "step": 475000
747
+ },
748
+ {
749
+ "epoch": 0.37677037925470896,
750
+ "grad_norm": 5.242143630981445,
751
+ "learning_rate": 0.00045820958150135304,
752
+ "loss": 5.854,
753
+ "step": 480000
754
+ },
755
+ {
756
+ "epoch": 0.3806950707052788,
757
+ "grad_norm": 4.846193313598633,
758
+ "learning_rate": 0.0004573510176867819,
759
+ "loss": 5.8465,
760
+ "step": 485000
761
+ },
762
+ {
763
+ "epoch": 0.3846197621558487,
764
+ "grad_norm": 4.976168632507324,
765
+ "learning_rate": 0.00045648471959526093,
766
+ "loss": 5.8378,
767
+ "step": 490000
768
+ },
769
+ {
770
+ "epoch": 0.3885444536064186,
771
+ "grad_norm": 5.064621448516846,
772
+ "learning_rate": 0.00045561054930020917,
773
+ "loss": 5.8489,
774
+ "step": 495000
775
+ },
776
+ {
777
+ "epoch": 0.3924691450569885,
778
+ "grad_norm": 7.152626991271973,
779
+ "learning_rate": 0.0004547285401292574,
780
+ "loss": 5.8365,
781
+ "step": 500000
782
+ },
783
+ {
784
+ "epoch": 0.3924691450569885,
785
+ "eval_loss": 2.987938642501831,
786
+ "eval_runtime": 1182.504,
787
+ "eval_samples_per_second": 182.618,
788
+ "eval_steps_per_second": 22.828,
789
+ "step": 500000
790
+ },
791
+ {
792
+ "epoch": 0.3963938365075584,
793
+ "grad_norm": 5.304065227508545,
794
+ "learning_rate": 0.0004538385468960594,
795
+ "loss": 5.8472,
796
+ "step": 505000
797
+ },
798
+ {
799
+ "epoch": 0.4003185279581283,
800
+ "grad_norm": 5.780062198638916,
801
+ "learning_rate": 0.00045294077923425393,
802
+ "loss": 5.8492,
803
+ "step": 510000
804
+ },
805
+ {
806
+ "epoch": 0.4042432194086981,
807
+ "grad_norm": 5.706847667694092,
808
+ "learning_rate": 0.0004520356352035454,
809
+ "loss": 5.8333,
810
+ "step": 515000
811
+ },
812
+ {
813
+ "epoch": 0.408167910859268,
814
+ "grad_norm": 5.198765277862549,
815
+ "learning_rate": 0.00045112297511828384,
816
+ "loss": 5.827,
817
+ "step": 520000
818
+ },
819
+ {
820
+ "epoch": 0.4120926023098379,
821
+ "grad_norm": 5.543708324432373,
822
+ "learning_rate": 0.0004502019134736622,
823
+ "loss": 5.8469,
824
+ "step": 525000
825
+ },
826
+ {
827
+ "epoch": 0.4160172937604078,
828
+ "grad_norm": 6.349376678466797,
829
+ "learning_rate": 0.00044927339954835976,
830
+ "loss": 5.8168,
831
+ "step": 530000
832
+ },
833
+ {
834
+ "epoch": 0.4199419852109777,
835
+ "grad_norm": 5.693711280822754,
836
+ "learning_rate": 0.0004483376613453707,
837
+ "loss": 5.8331,
838
+ "step": 535000
839
+ },
840
+ {
841
+ "epoch": 0.42386667666154754,
842
+ "grad_norm": 4.938353538513184,
843
+ "learning_rate": 0.0004473947436224123,
844
+ "loss": 5.8324,
845
+ "step": 540000
846
+ },
847
+ {
848
+ "epoch": 0.42779136811211743,
849
+ "grad_norm": 4.152617454528809,
850
+ "learning_rate": 0.0004464433541422548,
851
+ "loss": 5.8357,
852
+ "step": 545000
853
+ },
854
+ {
855
+ "epoch": 0.4317160595626873,
856
+ "grad_norm": 6.794117450714111,
857
+ "learning_rate": 0.0004454844692892605,
858
+ "loss": 5.8409,
859
+ "step": 550000
860
+ },
861
+ {
862
+ "epoch": 0.4317160595626873,
863
+ "eval_loss": 2.996899127960205,
864
+ "eval_runtime": 1172.5997,
865
+ "eval_samples_per_second": 184.16,
866
+ "eval_steps_per_second": 23.021,
867
+ "step": 550000
868
+ },
869
+ {
870
+ "epoch": 0.4356407510132572,
871
+ "grad_norm": 6.325488567352295,
872
+ "learning_rate": 0.0004445189016962925,
873
+ "loss": 5.8321,
874
+ "step": 555000
875
+ },
876
+ {
877
+ "epoch": 0.4395654424638271,
878
+ "grad_norm": 4.354986667633057,
879
+ "learning_rate": 0.0004435453375563152,
880
+ "loss": 5.8083,
881
+ "step": 560000
882
+ },
883
+ {
884
+ "epoch": 0.443490133914397,
885
+ "grad_norm": 6.791714191436768,
886
+ "learning_rate": 0.00044256458853620686,
887
+ "loss": 5.8181,
888
+ "step": 565000
889
+ },
890
+ {
891
+ "epoch": 0.44741482536496685,
892
+ "grad_norm": 5.404956817626953,
893
+ "learning_rate": 0.00044157629958651906,
894
+ "loss": 5.8209,
895
+ "step": 570000
896
+ },
897
+ {
898
+ "epoch": 0.45133951681553675,
899
+ "grad_norm": 6.600470066070557,
900
+ "learning_rate": 0.00044058110366587233,
901
+ "loss": 5.8144,
902
+ "step": 575000
903
+ },
904
+ {
905
+ "epoch": 0.45526420826610664,
906
+ "grad_norm": 7.110249996185303,
907
+ "learning_rate": 0.00043957844343575327,
908
+ "loss": 5.8177,
909
+ "step": 580000
910
+ },
911
+ {
912
+ "epoch": 0.45918889971667654,
913
+ "grad_norm": 4.988192081451416,
914
+ "learning_rate": 0.00043856896110892765,
915
+ "loss": 5.8048,
916
+ "step": 585000
917
+ },
918
+ {
919
+ "epoch": 0.46311359116724643,
920
+ "grad_norm": 5.140357494354248,
921
+ "learning_rate": 0.00043755168272890363,
922
+ "loss": 5.8167,
923
+ "step": 590000
924
+ },
925
+ {
926
+ "epoch": 0.46703828261781627,
927
+ "grad_norm": 7.784915447235107,
928
+ "learning_rate": 0.00043652786802263427,
929
+ "loss": 5.8068,
930
+ "step": 595000
931
+ },
932
+ {
933
+ "epoch": 0.47096297406838616,
934
+ "grad_norm": 5.686777591705322,
935
+ "learning_rate": 0.000435496326481159,
936
+ "loss": 5.8049,
937
+ "step": 600000
938
+ },
939
+ {
940
+ "epoch": 0.47096297406838616,
941
+ "eval_loss": 2.9820611476898193,
942
+ "eval_runtime": 1175.9282,
943
+ "eval_samples_per_second": 183.639,
944
+ "eval_steps_per_second": 22.955,
945
+ "step": 600000
946
+ },
947
+ {
948
+ "epoch": 0.47488766551895606,
949
+ "grad_norm": 4.776956081390381,
950
+ "learning_rate": 0.00043445771008574104,
951
+ "loss": 5.7853,
952
+ "step": 605000
953
+ },
954
+ {
955
+ "epoch": 0.47881235696952595,
956
+ "grad_norm": 5.395595073699951,
957
+ "learning_rate": 0.00043341205844934426,
958
+ "loss": 5.7943,
959
+ "step": 610000
960
+ },
961
+ {
962
+ "epoch": 0.48273704842009585,
963
+ "grad_norm": 6.324551582336426,
964
+ "learning_rate": 0.0004323598339897348,
965
+ "loss": 5.7885,
966
+ "step": 615000
967
+ },
968
+ {
969
+ "epoch": 0.48666173987066574,
970
+ "grad_norm": 8.08753776550293,
971
+ "learning_rate": 0.0004313002345566292,
972
+ "loss": 5.7802,
973
+ "step": 620000
974
+ },
975
+ {
976
+ "epoch": 0.4905864313212356,
977
+ "grad_norm": 4.667470932006836,
978
+ "learning_rate": 0.0004302343624111244,
979
+ "loss": 5.7936,
980
+ "step": 625000
981
+ },
982
+ {
983
+ "epoch": 0.4945111227718055,
984
+ "grad_norm": 4.392518043518066,
985
+ "learning_rate": 0.00042916076273439863,
986
+ "loss": 5.7818,
987
+ "step": 630000
988
+ },
989
+ {
990
+ "epoch": 0.4984358142223754,
991
+ "grad_norm": 5.52575159072876,
992
+ "learning_rate": 0.00042808076064426405,
993
+ "loss": 5.7728,
994
+ "step": 635000
995
+ },
996
+ {
997
+ "epoch": 0.5023605056729452,
998
+ "grad_norm": 5.269731521606445,
999
+ "learning_rate": 0.0004269941873694671,
1000
+ "loss": 5.7826,
1001
+ "step": 640000
1002
+ },
1003
+ {
1004
+ "epoch": 0.5062851971235152,
1005
+ "grad_norm": 4.810575008392334,
1006
+ "learning_rate": 0.00042590064944661286,
1007
+ "loss": 5.7781,
1008
+ "step": 645000
1009
+ },
1010
+ {
1011
+ "epoch": 0.510209888574085,
1012
+ "grad_norm": 4.98996114730835,
1013
+ "learning_rate": 0.0004248001845288868,
1014
+ "loss": 5.7747,
1015
+ "step": 650000
1016
+ },
1017
+ {
1018
+ "epoch": 0.510209888574085,
1019
+ "eval_loss": 2.961513042449951,
1020
+ "eval_runtime": 1127.3033,
1021
+ "eval_samples_per_second": 191.56,
1022
+ "eval_steps_per_second": 23.946,
1023
+ "step": 650000
1024
+ },
1025
+ {
1026
+ "epoch": 0.514134580024655,
1027
+ "grad_norm": 7.207222938537598,
1028
+ "learning_rate": 0.00042369371911227634,
1029
+ "loss": 5.7746,
1030
+ "step": 655000
1031
+ },
1032
+ {
1033
+ "epoch": 0.5180592714752248,
1034
+ "grad_norm": 5.494042873382568,
1035
+ "learning_rate": 0.0004225801900348928,
1036
+ "loss": 5.7699,
1037
+ "step": 660000
1038
+ },
1039
+ {
1040
+ "epoch": 0.5219839629257946,
1041
+ "grad_norm": 6.930171012878418,
1042
+ "learning_rate": 0.00042145985659294296,
1043
+ "loss": 5.7644,
1044
+ "step": 665000
1045
+ },
1046
+ {
1047
+ "epoch": 0.5259086543763646,
1048
+ "grad_norm": 4.1922760009765625,
1049
+ "learning_rate": 0.00042033320972059365,
1050
+ "loss": 5.7637,
1051
+ "step": 670000
1052
+ },
1053
+ {
1054
+ "epoch": 0.5298333458269344,
1055
+ "grad_norm": 5.89008092880249,
1056
+ "learning_rate": 0.00041920006891640475,
1057
+ "loss": 5.7521,
1058
+ "step": 675000
1059
+ },
1060
+ {
1061
+ "epoch": 0.5337580372775044,
1062
+ "grad_norm": 5.436644077301025,
1063
+ "learning_rate": 0.00041806047738122455,
1064
+ "loss": 5.7454,
1065
+ "step": 680000
1066
+ },
1067
+ {
1068
+ "epoch": 0.5376827287280742,
1069
+ "grad_norm": 6.073439121246338,
1070
+ "learning_rate": 0.00041691470849126257,
1071
+ "loss": 5.752,
1072
+ "step": 685000
1073
+ },
1074
+ {
1075
+ "epoch": 0.541607420178644,
1076
+ "grad_norm": 5.298029899597168,
1077
+ "learning_rate": 0.0004157623473475629,
1078
+ "loss": 5.748,
1079
+ "step": 690000
1080
+ },
1081
+ {
1082
+ "epoch": 0.545532111629214,
1083
+ "grad_norm": 16.55943489074707,
1084
+ "learning_rate": 0.00041460389899361316,
1085
+ "loss": 5.7357,
1086
+ "step": 695000
1087
+ },
1088
+ {
1089
+ "epoch": 0.5494568030797838,
1090
+ "grad_norm": 6.340664863586426,
1091
+ "learning_rate": 0.00041343871022983314,
1092
+ "loss": 5.7385,
1093
+ "step": 700000
1094
+ },
1095
+ {
1096
+ "epoch": 0.5494568030797838,
1097
+ "eval_loss": 2.939816951751709,
1098
+ "eval_runtime": 1153.8155,
1099
+ "eval_samples_per_second": 187.158,
1100
+ "eval_steps_per_second": 23.395,
1101
+ "step": 700000
1102
+ },
1103
+ {
1104
+ "epoch": 0.5533814945303538,
1105
+ "grad_norm": 5.492395877838135,
1106
+ "learning_rate": 0.0004122675228441709,
1107
+ "loss": 5.7247,
1108
+ "step": 705000
1109
+ },
1110
+ {
1111
+ "epoch": 0.5573061859809236,
1112
+ "grad_norm": 5.045653820037842,
1113
+ "learning_rate": 0.000411090385214718,
1114
+ "loss": 5.7244,
1115
+ "step": 710000
1116
+ },
1117
+ {
1118
+ "epoch": 0.5612308774314936,
1119
+ "grad_norm": 5.396609783172607,
1120
+ "learning_rate": 0.00040990663367439613,
1121
+ "loss": 5.7248,
1122
+ "step": 715000
1123
+ },
1124
+ {
1125
+ "epoch": 0.5651555688820634,
1126
+ "grad_norm": 5.786918640136719,
1127
+ "learning_rate": 0.0004087174991472511,
1128
+ "loss": 5.719,
1129
+ "step": 720000
1130
+ },
1131
+ {
1132
+ "epoch": 0.5690802603326333,
1133
+ "grad_norm": 6.569219589233398,
1134
+ "learning_rate": 0.0004075215988876539,
1135
+ "loss": 5.7122,
1136
+ "step": 725000
1137
+ },
1138
+ {
1139
+ "epoch": 0.5730049517832032,
1140
+ "grad_norm": 5.519715309143066,
1141
+ "learning_rate": 0.00040631993176130574,
1142
+ "loss": 5.7093,
1143
+ "step": 730000
1144
+ },
1145
+ {
1146
+ "epoch": 0.576929643233773,
1147
+ "grad_norm": 6.44736385345459,
1148
+ "learning_rate": 0.00040511206274228083,
1149
+ "loss": 5.7029,
1150
+ "step": 735000
1151
+ },
1152
+ {
1153
+ "epoch": 0.580854334684343,
1154
+ "grad_norm": 5.93750524520874,
1155
+ "learning_rate": 0.00040389852112315274,
1156
+ "loss": 5.7025,
1157
+ "step": 740000
1158
+ },
1159
+ {
1160
+ "epoch": 0.5847790261349128,
1161
+ "grad_norm": 5.719981670379639,
1162
+ "learning_rate": 0.0004026793567057317,
1163
+ "loss": 5.7028,
1164
+ "step": 745000
1165
+ },
1166
+ {
1167
+ "epoch": 0.5887037175854827,
1168
+ "grad_norm": 6.105369567871094,
1169
+ "learning_rate": 0.00040145388229039284,
1170
+ "loss": 5.6877,
1171
+ "step": 750000
1172
+ },
1173
+ {
1174
+ "epoch": 0.5887037175854827,
1175
+ "eval_loss": 2.918062925338745,
1176
+ "eval_runtime": 1152.6996,
1177
+ "eval_samples_per_second": 187.339,
1178
+ "eval_steps_per_second": 23.418,
1179
+ "step": 750000
1180
+ },
1181
+ {
1182
+ "epoch": 0.5926284090360526,
1183
+ "grad_norm": 6.412558078765869,
1184
+ "learning_rate": 0.0004002223845301598,
1185
+ "loss": 5.6934,
1186
+ "step": 755000
1187
+ },
1188
+ {
1189
+ "epoch": 0.5965531004866225,
1190
+ "grad_norm": 5.392569541931152,
1191
+ "learning_rate": 0.0003989856510076611,
1192
+ "loss": 5.6804,
1193
+ "step": 760000
1194
+ },
1195
+ {
1196
+ "epoch": 0.6004777919371924,
1197
+ "grad_norm": 4.88016939163208,
1198
+ "learning_rate": 0.0003977434865733831,
1199
+ "loss": 5.6811,
1200
+ "step": 765000
1201
+ },
1202
+ {
1203
+ "epoch": 0.6044024833877623,
1204
+ "grad_norm": 3.566276788711548,
1205
+ "learning_rate": 0.0003964951911528423,
1206
+ "loss": 5.6704,
1207
+ "step": 770000
1208
+ },
1209
+ {
1210
+ "epoch": 0.6083271748383321,
1211
+ "grad_norm": 5.245337963104248,
1212
+ "learning_rate": 0.00039524130838671694,
1213
+ "loss": 5.6752,
1214
+ "step": 775000
1215
+ },
1216
+ {
1217
+ "epoch": 0.612251866288902,
1218
+ "grad_norm": 5.569911003112793,
1219
+ "learning_rate": 0.0003939823910692354,
1220
+ "loss": 5.678,
1221
+ "step": 780000
1222
+ },
1223
+ {
1224
+ "epoch": 0.6161765577394719,
1225
+ "grad_norm": 6.871520042419434,
1226
+ "learning_rate": 0.0003927174794803744,
1227
+ "loss": 5.6667,
1228
+ "step": 785000
1229
+ },
1230
+ {
1231
+ "epoch": 0.6201012491900418,
1232
+ "grad_norm": 4.4467034339904785,
1233
+ "learning_rate": 0.0003914473792912365,
1234
+ "loss": 5.656,
1235
+ "step": 790000
1236
+ },
1237
+ {
1238
+ "epoch": 0.6240259406406117,
1239
+ "grad_norm": 7.6152448654174805,
1240
+ "learning_rate": 0.00039017188643885047,
1241
+ "loss": 5.6461,
1242
+ "step": 795000
1243
+ },
1244
+ {
1245
+ "epoch": 0.6279506320911816,
1246
+ "grad_norm": 6.26963996887207,
1247
+ "learning_rate": 0.0003888910495512182,
1248
+ "loss": 5.6608,
1249
+ "step": 800000
1250
+ },
1251
+ {
1252
+ "epoch": 0.6279506320911816,
1253
+ "eval_loss": 2.8902182579040527,
1254
+ "eval_runtime": 1155.3086,
1255
+ "eval_samples_per_second": 186.916,
1256
+ "eval_steps_per_second": 23.365,
1257
+ "step": 800000
1258
+ },
1259
+ {
1260
+ "epoch": 0.6318753235417515,
1261
+ "grad_norm": 4.018621921539307,
1262
+ "learning_rate": 0.0003876054331712927,
1263
+ "loss": 5.6525,
1264
+ "step": 805000
1265
+ },
1266
+ {
1267
+ "epoch": 0.6358000149923213,
1268
+ "grad_norm": 5.362500190734863,
1269
+ "learning_rate": 0.00038631379809951025,
1270
+ "loss": 5.636,
1271
+ "step": 810000
1272
+ },
1273
+ {
1274
+ "epoch": 0.6397247064428913,
1275
+ "grad_norm": 6.672492027282715,
1276
+ "learning_rate": 0.00038501722393710636,
1277
+ "loss": 5.6273,
1278
+ "step": 815000
1279
+ },
1280
+ {
1281
+ "epoch": 0.6436493978934611,
1282
+ "grad_norm": 4.727617263793945,
1283
+ "learning_rate": 0.0003837152413000372,
1284
+ "loss": 5.6233,
1285
+ "step": 820000
1286
+ },
1287
+ {
1288
+ "epoch": 0.647574089344031,
1289
+ "grad_norm": 6.725893974304199,
1290
+ "learning_rate": 0.0003824086826970783,
1291
+ "loss": 5.6242,
1292
+ "step": 825000
1293
+ },
1294
+ {
1295
+ "epoch": 0.6514987807946009,
1296
+ "grad_norm": 5.975217819213867,
1297
+ "learning_rate": 0.0003810976040094335,
1298
+ "loss": 5.6134,
1299
+ "step": 830000
1300
+ },
1301
+ {
1302
+ "epoch": 0.6554234722451707,
1303
+ "grad_norm": 5.095362663269043,
1304
+ "learning_rate": 0.0003797804773430985,
1305
+ "loss": 5.619,
1306
+ "step": 835000
1307
+ },
1308
+ {
1309
+ "epoch": 0.6593481636957407,
1310
+ "grad_norm": 5.389176845550537,
1311
+ "learning_rate": 0.00037845813587251276,
1312
+ "loss": 5.614,
1313
+ "step": 840000
1314
+ },
1315
+ {
1316
+ "epoch": 0.6632728551463105,
1317
+ "grad_norm": 5.1916351318359375,
1318
+ "learning_rate": 0.000377131690836747,
1319
+ "loss": 5.6095,
1320
+ "step": 845000
1321
+ },
1322
+ {
1323
+ "epoch": 0.6671975465968805,
1324
+ "grad_norm": 6.15248966217041,
1325
+ "learning_rate": 0.0003757998670619693,
1326
+ "loss": 5.6033,
1327
+ "step": 850000
1328
+ },
1329
+ {
1330
+ "epoch": 0.6671975465968805,
1331
+ "eval_loss": 2.8692898750305176,
1332
+ "eval_runtime": 1175.9151,
1333
+ "eval_samples_per_second": 183.641,
1334
+ "eval_steps_per_second": 22.956,
1335
+ "step": 850000
1336
+ },
1337
+ {
1338
+ "epoch": 0.6711222380474503,
1339
+ "grad_norm": 6.101210594177246,
1340
+ "learning_rate": 0.0003744637809722457,
1341
+ "loss": 5.6013,
1342
+ "step": 855000
1343
+ },
1344
+ {
1345
+ "epoch": 0.6750469294980201,
1346
+ "grad_norm": 6.87960958480835,
1347
+ "learning_rate": 0.0003731226828172506,
1348
+ "loss": 5.592,
1349
+ "step": 860000
1350
+ },
1351
+ {
1352
+ "epoch": 0.6789716209485901,
1353
+ "grad_norm": 5.071200370788574,
1354
+ "learning_rate": 0.0003717768906257172,
1355
+ "loss": 5.5893,
1356
+ "step": 865000
1357
+ },
1358
+ {
1359
+ "epoch": 0.6828963123991599,
1360
+ "grad_norm": 7.438723564147949,
1361
+ "learning_rate": 0.0003704264557058006,
1362
+ "loss": 5.598,
1363
+ "step": 870000
1364
+ },
1365
+ {
1366
+ "epoch": 0.6868210038497299,
1367
+ "grad_norm": 6.021530628204346,
1368
+ "learning_rate": 0.00036907170111211227,
1369
+ "loss": 5.59,
1370
+ "step": 875000
1371
+ },
1372
+ {
1373
+ "epoch": 0.6907456953002997,
1374
+ "grad_norm": 5.211944103240967,
1375
+ "learning_rate": 0.000367711863796493,
1376
+ "loss": 5.5696,
1377
+ "step": 880000
1378
+ },
1379
+ {
1380
+ "epoch": 0.6946703867508696,
1381
+ "grad_norm": 6.795500755310059,
1382
+ "learning_rate": 0.00036634863039584095,
1383
+ "loss": 5.5723,
1384
+ "step": 885000
1385
+ },
1386
+ {
1387
+ "epoch": 0.6985950782014395,
1388
+ "grad_norm": 5.494241714477539,
1389
+ "learning_rate": 0.0003649795953068187,
1390
+ "loss": 5.5547,
1391
+ "step": 890000
1392
+ },
1393
+ {
1394
+ "epoch": 0.7025197696520094,
1395
+ "grad_norm": 6.3428544998168945,
1396
+ "learning_rate": 0.00036360672336543015,
1397
+ "loss": 5.5556,
1398
+ "step": 895000
1399
+ },
1400
+ {
1401
+ "epoch": 0.7064444611025793,
1402
+ "grad_norm": 6.741039276123047,
1403
+ "learning_rate": 0.0003622292442014995,
1404
+ "loss": 5.5512,
1405
+ "step": 900000
1406
+ },
1407
+ {
1408
+ "epoch": 0.7064444611025793,
1409
+ "eval_loss": 2.840968608856201,
1410
+ "eval_runtime": 1111.2113,
1411
+ "eval_samples_per_second": 194.334,
1412
+ "eval_steps_per_second": 24.292,
1413
+ "step": 900000
1414
+ }
1415
+ ],
1416
+ "logging_steps": 5000,
1417
+ "max_steps": 2547970,
1418
+ "num_input_tokens_seen": 0,
1419
+ "num_train_epochs": 2,
1420
+ "save_steps": 100000,
1421
+ "stateful_callbacks": {
1422
+ "TrainerControl": {
1423
+ "args": {
1424
+ "should_epoch_stop": false,
1425
+ "should_evaluate": false,
1426
+ "should_log": false,
1427
+ "should_save": true,
1428
+ "should_training_stop": false
1429
+ },
1430
+ "attributes": {}
1431
+ }
1432
+ },
1433
+ "total_flos": 8.752003246057943e+18,
1434
+ "train_batch_size": 8,
1435
+ "trial_name": null,
1436
+ "trial_params": null
1437
+ }
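
trainer_state.json records the pretraining trajectory so far: 900,000 of 2,547,970 planned steps (about 0.71 of 2 epochs, train batch size 8), with the evaluation loss falling from 3.44 at step 50,000 to 2.84 at step 900,000, the current best checkpoint. A minimal sketch for pulling the evaluation-loss curve out of this file, assuming only the layout shown above:

```python
# Minimal sketch: extract the evaluation-loss curve from trainer_state.json.
# Entries with "eval_loss" are evaluation points (every 50,000 steps); the
# others are training log steps carrying loss, grad_norm and learning_rate.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]
for step, loss in evals:
    print(f"step {step:>7}: eval_loss {loss:.3f}")

print("best:", state["best_metric"], "at", state["best_model_checkpoint"])
```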
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f2cc9b694f6483da0c07d8c6c9c2da9a68a7197ae2d74b602b5b263830cfbb5f
+ size 5432
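
training_args.bin is the TrainingArguments object the Trainer pickles next to each checkpoint; it is loaded with torch, so a compatible transformers version must be installed. A hedged sketch for inspecting a downloaded copy (the attribute names are standard TrainingArguments fields, not values confirmed by this commit):

```python
# Minimal sketch: inspect the pickled TrainingArguments. The attribute names
# are standard TrainingArguments fields and are assumptions here, not values
# read from this commit.
import torch

args = torch.load("training_args.bin", weights_only=False)
print(type(args).__name__)
print("learning_rate:", args.learning_rate)
print("per_device_train_batch_size:", args.per_device_train_batch_size)
print("max_steps:", args.max_steps)
```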