astardusta commited on
Commit
2e722e6
·
verified ·
1 Parent(s): 24ad7fb

End of training

Browse files
Files changed (4) hide show
  1. all_results.json +5 -5
  2. model.safetensors +1 -1
  3. train_results.json +5 -5
  4. trainer_state.json +127 -127
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.0010977147303135788,
4
- "train_runtime": 1029.9631,
5
- "train_samples": 41,
6
- "train_samples_per_second": 0.078,
7
- "train_steps_per_second": 0.019
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.0,
4
+ "train_runtime": 1185.7286,
5
+ "train_samples": 4,
6
+ "train_samples_per_second": 0.067,
7
+ "train_steps_per_second": 0.017
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ce139ecd3281a7380df2c3936f21b3420de47386afe270772bb502c4acd4e901
3
  size 1976163472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2697fc174e28904067fcab74d824745b5ab1e8819842bc47e81595810ea4dab8
3
  size 1976163472
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.0010977147303135788,
4
- "train_runtime": 1029.9631,
5
- "train_samples": 41,
6
- "train_samples_per_second": 0.078,
7
- "train_steps_per_second": 0.019
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.0,
4
+ "train_runtime": 1185.7286,
5
+ "train_samples": 4,
6
+ "train_samples_per_second": 0.067,
7
+ "train_steps_per_second": 0.017
8
  }
trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.975609756097561,
6
  "eval_steps": 500,
7
  "global_step": 20,
8
  "is_hyper_param_search": false,
@@ -16,31 +16,31 @@
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
  "completions/clipped_ratio": 0.0,
19
- "completions/max_length": 11.0,
20
- "completions/max_terminated_length": 11.0,
21
- "completions/mean_length": 7.0,
22
- "completions/mean_terminated_length": 7.0,
23
  "completions/min_length": 4.0,
24
  "completions/min_terminated_length": 4.0,
25
- "epoch": 0.0975609756097561,
26
- "frac_reward_zero_std": 0.5,
27
- "grad_norm": 71.97881317138672,
28
  "kl": 0.0,
29
  "learning_rate": 1e-06,
30
- "loss": -0.0071,
31
- "num_tokens": 2104.0,
32
- "reward": 1.305239737033844,
33
- "reward_std": 0.10550488950684667,
34
- "rewards/concensus_correctness_reward_func/mean": 0.7842499911785126,
35
- "rewards/concensus_correctness_reward_func/std": 0.9670694470405579,
36
- "rewards/consensus_reward_func/mean": 0.5,
37
- "rewards/consensus_reward_func/std": 0.5773502588272095,
38
  "rewards/cumulative_reward_2/mean": 0.0,
39
  "rewards/cumulative_reward_2/std": 0.0,
40
  "rewards/final_correctness_reward_func/mean": 0.0,
41
  "rewards/final_correctness_reward_func/std": 0.0,
42
- "rewards/question_recreation_reward_func/mean": 0.02098984457552433,
43
- "rewards/question_recreation_reward_func/std": 0.014506918843835592,
44
  "rewards/soft_format_reward_func/mean": 0.0,
45
  "rewards/soft_format_reward_func/std": 0.0,
46
  "rewards/strict_format_reward_func/mean": 0.0,
@@ -62,25 +62,25 @@
62
  "completions/mean_terminated_length": 4.0,
63
  "completions/min_length": 4.0,
64
  "completions/min_terminated_length": 4.0,
65
- "epoch": 0.1951219512195122,
66
  "frac_reward_zero_std": 1.0,
67
- "grad_norm": 7.450817065546289e-06,
68
- "kl": 0.00017640739679336548,
69
  "learning_rate": 9.729086208503173e-07,
70
  "loss": 0.0,
71
- "num_tokens": 4184.0,
72
- "reward": 2.4889785051345825,
73
  "reward_std": 0.0,
74
- "rewards/concensus_correctness_reward_func/mean": 1.940999984741211,
75
- "rewards/concensus_correctness_reward_func/std": 2.2412737607955933,
76
- "rewards/consensus_reward_func/mean": 0.5,
77
- "rewards/consensus_reward_func/std": 0.5773502588272095,
78
  "rewards/cumulative_reward_2/mean": 0.0,
79
  "rewards/cumulative_reward_2/std": 0.0,
80
  "rewards/final_correctness_reward_func/mean": 0.0,
81
  "rewards/final_correctness_reward_func/std": 0.0,
82
- "rewards/question_recreation_reward_func/mean": 0.047978651942685246,
83
- "rewards/question_recreation_reward_func/std": 0.0,
84
  "rewards/soft_format_reward_func/mean": 0.0,
85
  "rewards/soft_format_reward_func/std": 0.0,
86
  "rewards/strict_format_reward_func/mean": 0.0,
@@ -102,25 +102,25 @@
102
  "completions/mean_terminated_length": 4.0,
103
  "completions/min_length": 4.0,
104
  "completions/min_terminated_length": 4.0,
105
- "epoch": 0.2926829268292683,
106
  "frac_reward_zero_std": 1.0,
107
- "grad_norm": 0.00392628088593483,
108
- "kl": 0.00011070072650909424,
109
  "learning_rate": 8.945702546981968e-07,
110
  "loss": 0.0,
111
- "num_tokens": 6264.0,
112
- "reward": 2.96940016746521,
113
  "reward_std": 0.0,
114
- "rewards/concensus_correctness_reward_func/mean": 1.4599999785423279,
115
- "rewards/concensus_correctness_reward_func/std": 1.1085125207901,
116
- "rewards/consensus_reward_func/mean": 1.5,
117
- "rewards/consensus_reward_func/std": 0.5773502588272095,
118
  "rewards/cumulative_reward_2/mean": 0.0,
119
  "rewards/cumulative_reward_2/std": 0.0,
120
  "rewards/final_correctness_reward_func/mean": 0.0,
121
  "rewards/final_correctness_reward_func/std": 0.0,
122
- "rewards/question_recreation_reward_func/mean": 0.009400193579494953,
123
- "rewards/question_recreation_reward_func/std": 0.0027492870576679707,
124
  "rewards/soft_format_reward_func/mean": 0.0,
125
  "rewards/soft_format_reward_func/std": 0.0,
126
  "rewards/strict_format_reward_func/mean": 0.0,
@@ -136,31 +136,31 @@
136
  "clip_ratio/low_min": 0.0,
137
  "clip_ratio/region_mean": 0.0,
138
  "completions/clipped_ratio": 0.0,
139
- "completions/max_length": 5.0,
140
- "completions/max_terminated_length": 5.0,
141
- "completions/mean_length": 4.25,
142
- "completions/mean_terminated_length": 4.25,
143
  "completions/min_length": 4.0,
144
  "completions/min_terminated_length": 4.0,
145
- "epoch": 0.3902439024390244,
146
- "frac_reward_zero_std": 0.75,
147
- "grad_norm": 1.0253148730043904e-06,
148
- "kl": 0.28631603345274925,
149
  "learning_rate": 7.734740790612136e-07,
150
- "loss": 0.018,
151
- "num_tokens": 8346.0,
152
- "reward": 2.461086630821228,
153
- "reward_std": 0.6597800850868225,
154
- "rewards/concensus_correctness_reward_func/mean": 1.448500007390976,
155
- "rewards/concensus_correctness_reward_func/std": 2.0686065554618835,
156
- "rewards/consensus_reward_func/mean": 1.0,
157
- "rewards/consensus_reward_func/std": 1.154700517654419,
158
  "rewards/cumulative_reward_2/mean": 0.0,
159
  "rewards/cumulative_reward_2/std": 0.0,
160
  "rewards/final_correctness_reward_func/mean": 0.0,
161
  "rewards/final_correctness_reward_func/std": 0.0,
162
- "rewards/question_recreation_reward_func/mean": 0.012586628086864948,
163
- "rewards/question_recreation_reward_func/std": 0.004967818967998028,
164
  "rewards/soft_format_reward_func/mean": 0.0,
165
  "rewards/soft_format_reward_func/std": 0.0,
166
  "rewards/strict_format_reward_func/mean": 0.0,
@@ -182,25 +182,25 @@
182
  "completions/mean_terminated_length": 4.0,
183
  "completions/min_length": 4.0,
184
  "completions/min_terminated_length": 4.0,
185
- "epoch": 0.4878048780487805,
186
  "frac_reward_zero_std": 1.0,
187
- "grad_norm": 0.06691131740808487,
188
- "kl": 0.06441881880164146,
189
  "learning_rate": 6.227427435703995e-07,
190
- "loss": 0.0001,
191
- "num_tokens": 10426.0,
192
- "reward": 2.5516220331192017,
193
  "reward_std": 0.0,
194
- "rewards/concensus_correctness_reward_func/mean": 1.0370000302791595,
195
- "rewards/concensus_correctness_reward_func/std": 1.1974244713783264,
196
- "rewards/consensus_reward_func/mean": 1.5,
197
- "rewards/consensus_reward_func/std": 0.5773502588272095,
198
  "rewards/cumulative_reward_2/mean": 0.0,
199
  "rewards/cumulative_reward_2/std": 0.0,
200
  "rewards/final_correctness_reward_func/mean": 0.0,
201
  "rewards/final_correctness_reward_func/std": 0.0,
202
- "rewards/question_recreation_reward_func/mean": 0.014621995389461517,
203
- "rewards/question_recreation_reward_func/std": 0.0,
204
  "rewards/soft_format_reward_func/mean": 0.0,
205
  "rewards/soft_format_reward_func/std": 0.0,
206
  "rewards/strict_format_reward_func/mean": 0.0,
@@ -222,24 +222,24 @@
222
  "completions/mean_terminated_length": 4.0,
223
  "completions/min_length": 4.0,
224
  "completions/min_terminated_length": 4.0,
225
- "epoch": 0.5853658536585366,
226
  "frac_reward_zero_std": 1.0,
227
- "grad_norm": 1.1406209523556754e-06,
228
- "kl": 7.622689008712769e-05,
229
  "learning_rate": 4.5871032726383385e-07,
230
  "loss": 0.0,
231
- "num_tokens": 12506.0,
232
- "reward": 1.0048940181732178,
233
  "reward_std": 0.0,
234
- "rewards/concensus_correctness_reward_func/mean": 0.5,
235
- "rewards/concensus_correctness_reward_func/std": 0.5773502588272095,
236
- "rewards/consensus_reward_func/mean": 0.5,
237
- "rewards/consensus_reward_func/std": 0.5773502588272095,
238
  "rewards/cumulative_reward_2/mean": 0.0,
239
  "rewards/cumulative_reward_2/std": 0.0,
240
  "rewards/final_correctness_reward_func/mean": 0.0,
241
  "rewards/final_correctness_reward_func/std": 0.0,
242
- "rewards/question_recreation_reward_func/mean": 0.004893964156508446,
243
  "rewards/question_recreation_reward_func/std": 0.0,
244
  "rewards/soft_format_reward_func/mean": 0.0,
245
  "rewards/soft_format_reward_func/std": 0.0,
@@ -262,24 +262,24 @@
262
  "completions/mean_terminated_length": 4.0,
263
  "completions/min_length": 4.0,
264
  "completions/min_terminated_length": 4.0,
265
- "epoch": 0.6829268292682927,
266
  "frac_reward_zero_std": 1.0,
267
- "grad_norm": 5.935615990892984e-05,
268
- "kl": 0.00036910921335220337,
269
  "learning_rate": 2.9915228767351535e-07,
270
  "loss": 0.0,
271
- "num_tokens": 14586.0,
272
- "reward": 0.9908410720527172,
273
  "reward_std": 0.0,
274
- "rewards/concensus_correctness_reward_func/mean": 0.9620000123977661,
275
- "rewards/concensus_correctness_reward_func/std": 0.0,
276
  "rewards/consensus_reward_func/mean": 0.0,
277
  "rewards/consensus_reward_func/std": 0.0,
278
  "rewards/cumulative_reward_2/mean": 0.0,
279
  "rewards/cumulative_reward_2/std": 0.0,
280
  "rewards/final_correctness_reward_func/mean": 0.0,
281
  "rewards/final_correctness_reward_func/std": 0.0,
282
- "rewards/question_recreation_reward_func/mean": 0.02884104219265282,
283
  "rewards/question_recreation_reward_func/std": 0.0,
284
  "rewards/soft_format_reward_func/mean": 0.0,
285
  "rewards/soft_format_reward_func/std": 0.0,
@@ -302,25 +302,25 @@
302
  "completions/mean_terminated_length": 4.0,
303
  "completions/min_length": 4.0,
304
  "completions/min_terminated_length": 4.0,
305
- "epoch": 0.7804878048780488,
306
  "frac_reward_zero_std": 1.0,
307
- "grad_norm": 4.315011778999178e-07,
308
- "kl": 1.2665987014770508e-07,
309
  "learning_rate": 1.6135921418712955e-07,
310
  "loss": 0.0,
311
- "num_tokens": 16666.0,
312
- "reward": 3.3268758058547974,
313
  "reward_std": 0.0,
314
- "rewards/concensus_correctness_reward_func/mean": 2.2880000174045563,
315
- "rewards/concensus_correctness_reward_func/std": 0.7355442643165588,
316
- "rewards/consensus_reward_func/mean": 1.0,
317
- "rewards/consensus_reward_func/std": 1.154700517654419,
318
  "rewards/cumulative_reward_2/mean": 0.0,
319
  "rewards/cumulative_reward_2/std": 0.0,
320
  "rewards/final_correctness_reward_func/mean": 0.0,
321
  "rewards/final_correctness_reward_func/std": 0.0,
322
- "rewards/question_recreation_reward_func/mean": 0.03887587878853083,
323
- "rewards/question_recreation_reward_func/std": 0.0031549197155982256,
324
  "rewards/soft_format_reward_func/mean": 0.0,
325
  "rewards/soft_format_reward_func/std": 0.0,
326
  "rewards/strict_format_reward_func/mean": 0.0,
@@ -342,25 +342,25 @@
342
  "completions/mean_terminated_length": 4.0,
343
  "completions/min_length": 4.0,
344
  "completions/min_terminated_length": 4.0,
345
- "epoch": 0.8780487804878049,
346
  "frac_reward_zero_std": 1.0,
347
- "grad_norm": 5.20552760008286e-07,
348
- "kl": 0.001730598509311676,
349
  "learning_rate": 6.026312439675551e-08,
350
  "loss": 0.0,
351
- "num_tokens": 18746.0,
352
- "reward": 3.8770835399627686,
353
  "reward_std": 0.0,
354
- "rewards/concensus_correctness_reward_func/mean": 2.825500011444092,
355
- "rewards/concensus_correctness_reward_func/std": 1.0502002239227295,
356
- "rewards/consensus_reward_func/mean": 1.0,
357
- "rewards/consensus_reward_func/std": 1.154700517654419,
358
  "rewards/cumulative_reward_2/mean": 0.0,
359
  "rewards/cumulative_reward_2/std": 0.0,
360
  "rewards/final_correctness_reward_func/mean": 0.0,
361
  "rewards/final_correctness_reward_func/std": 0.0,
362
- "rewards/question_recreation_reward_func/mean": 0.05158371292054653,
363
- "rewards/question_recreation_reward_func/std": 0.003396179061383009,
364
  "rewards/soft_format_reward_func/mean": 0.0,
365
  "rewards/soft_format_reward_func/std": 0.0,
366
  "rewards/strict_format_reward_func/mean": 0.0,
@@ -382,24 +382,24 @@
382
  "completions/mean_terminated_length": 4.0,
383
  "completions/min_length": 4.0,
384
  "completions/min_terminated_length": 4.0,
385
- "epoch": 0.975609756097561,
386
  "frac_reward_zero_std": 1.0,
387
- "grad_norm": 2.5999270292231813e-06,
388
- "kl": 0.047765836119651794,
389
  "learning_rate": 6.819348298638839e-09,
390
  "loss": 0.0,
391
- "num_tokens": 20826.0,
392
- "reward": 1.658750057220459,
393
  "reward_std": 0.0,
394
- "rewards/concensus_correctness_reward_func/mean": 1.0805000066757202,
395
- "rewards/concensus_correctness_reward_func/std": 1.2476539611816406,
396
- "rewards/consensus_reward_func/mean": 0.5,
397
- "rewards/consensus_reward_func/std": 0.5773502588272095,
398
  "rewards/cumulative_reward_2/mean": 0.0,
399
  "rewards/cumulative_reward_2/std": 0.0,
400
  "rewards/final_correctness_reward_func/mean": 0.0,
401
  "rewards/final_correctness_reward_func/std": 0.0,
402
- "rewards/question_recreation_reward_func/mean": 0.0782500971108675,
403
  "rewards/question_recreation_reward_func/std": 0.0,
404
  "rewards/soft_format_reward_func/mean": 0.0,
405
  "rewards/soft_format_reward_func/std": 0.0,
@@ -410,19 +410,19 @@
410
  "step": 20
411
  },
412
  {
413
- "epoch": 0.975609756097561,
414
  "step": 20,
415
  "total_flos": 0.0,
416
- "train_loss": 0.0010977147303135788,
417
- "train_runtime": 1029.9631,
418
- "train_samples_per_second": 0.078,
419
- "train_steps_per_second": 0.019
420
  }
421
  ],
422
  "logging_steps": 2,
423
  "max_steps": 20,
424
- "num_input_tokens_seen": 20826,
425
- "num_train_epochs": 1,
426
  "save_steps": 25,
427
  "stateful_callbacks": {
428
  "TrainerControl": {
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 10.0,
6
  "eval_steps": 500,
7
  "global_step": 20,
8
  "is_hyper_param_search": false,
 
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
  "completions/clipped_ratio": 0.0,
19
+ "completions/max_length": 4.0,
20
+ "completions/max_terminated_length": 4.0,
21
+ "completions/mean_length": 4.0,
22
+ "completions/mean_terminated_length": 4.0,
23
  "completions/min_length": 4.0,
24
  "completions/min_terminated_length": 4.0,
25
+ "epoch": 1.0,
26
+ "frac_reward_zero_std": 1.0,
27
+ "grad_norm": 0.0,
28
  "kl": 0.0,
29
  "learning_rate": 1e-06,
30
+ "loss": 0.0,
31
+ "num_tokens": 2080.0,
32
+ "reward": 2.215920627117157,
33
+ "reward_std": 0.0,
34
+ "rewards/concensus_correctness_reward_func/mean": 2.1505000591278076,
35
+ "rewards/concensus_correctness_reward_func/std": 0.9416583180427551,
36
+ "rewards/consensus_reward_func/mean": 0.0,
37
+ "rewards/consensus_reward_func/std": 0.0,
38
  "rewards/cumulative_reward_2/mean": 0.0,
39
  "rewards/cumulative_reward_2/std": 0.0,
40
  "rewards/final_correctness_reward_func/mean": 0.0,
41
  "rewards/final_correctness_reward_func/std": 0.0,
42
+ "rewards/question_recreation_reward_func/mean": 0.06542056053876877,
43
+ "rewards/question_recreation_reward_func/std": 0.0,
44
  "rewards/soft_format_reward_func/mean": 0.0,
45
  "rewards/soft_format_reward_func/std": 0.0,
46
  "rewards/strict_format_reward_func/mean": 0.0,
 
62
  "completions/mean_terminated_length": 4.0,
63
  "completions/min_length": 4.0,
64
  "completions/min_terminated_length": 4.0,
65
+ "epoch": 2.0,
66
  "frac_reward_zero_std": 1.0,
67
+ "grad_norm": 0.0,
68
+ "kl": 0.0,
69
  "learning_rate": 9.729086208503173e-07,
70
  "loss": 0.0,
71
+ "num_tokens": 4160.0,
72
+ "reward": 0.6780470022931695,
73
  "reward_std": 0.0,
74
+ "rewards/concensus_correctness_reward_func/mean": 0.6675000190734863,
75
+ "rewards/concensus_correctness_reward_func/std": 0.7707626819610596,
76
+ "rewards/consensus_reward_func/mean": 0.0,
77
+ "rewards/consensus_reward_func/std": 0.0,
78
  "rewards/cumulative_reward_2/mean": 0.0,
79
  "rewards/cumulative_reward_2/std": 0.0,
80
  "rewards/final_correctness_reward_func/mean": 0.0,
81
  "rewards/final_correctness_reward_func/std": 0.0,
82
+ "rewards/question_recreation_reward_func/mean": 0.010547009063884616,
83
+ "rewards/question_recreation_reward_func/std": 0.002309401286765933,
84
  "rewards/soft_format_reward_func/mean": 0.0,
85
  "rewards/soft_format_reward_func/std": 0.0,
86
  "rewards/strict_format_reward_func/mean": 0.0,
 
102
  "completions/mean_terminated_length": 4.0,
103
  "completions/min_length": 4.0,
104
  "completions/min_terminated_length": 4.0,
105
+ "epoch": 3.0,
106
  "frac_reward_zero_std": 1.0,
107
+ "grad_norm": 0.0,
108
+ "kl": 0.0,
109
  "learning_rate": 8.945702546981968e-07,
110
  "loss": 0.0,
111
+ "num_tokens": 6240.0,
112
+ "reward": 2.3659205436706543,
113
  "reward_std": 0.0,
114
+ "rewards/concensus_correctness_reward_func/mean": 2.2985000610351562,
115
+ "rewards/concensus_correctness_reward_func/std": 0.7707626819610596,
116
+ "rewards/consensus_reward_func/mean": 0.0,
117
+ "rewards/consensus_reward_func/std": 0.0,
118
  "rewards/cumulative_reward_2/mean": 0.0,
119
  "rewards/cumulative_reward_2/std": 0.0,
120
  "rewards/final_correctness_reward_func/mean": 0.0,
121
  "rewards/final_correctness_reward_func/std": 0.0,
122
+ "rewards/question_recreation_reward_func/mean": 0.06742056063376367,
123
+ "rewards/question_recreation_reward_func/std": 0.002309401286765933,
124
  "rewards/soft_format_reward_func/mean": 0.0,
125
  "rewards/soft_format_reward_func/std": 0.0,
126
  "rewards/strict_format_reward_func/mean": 0.0,
 
136
  "clip_ratio/low_min": 0.0,
137
  "clip_ratio/region_mean": 0.0,
138
  "completions/clipped_ratio": 0.0,
139
+ "completions/max_length": 4.0,
140
+ "completions/max_terminated_length": 4.0,
141
+ "completions/mean_length": 4.0,
142
+ "completions/mean_terminated_length": 4.0,
143
  "completions/min_length": 4.0,
144
  "completions/min_terminated_length": 4.0,
145
+ "epoch": 4.0,
146
+ "frac_reward_zero_std": 1.0,
147
+ "grad_norm": 0.0,
148
+ "kl": 0.0,
149
  "learning_rate": 7.734740790612136e-07,
150
+ "loss": 0.0,
151
+ "num_tokens": 8320.0,
152
+ "reward": 3.709144711494446,
153
+ "reward_std": 0.0,
154
+ "rewards/concensus_correctness_reward_func/mean": 3.5920000076293945,
155
+ "rewards/concensus_correctness_reward_func/std": 0.0011546856258064508,
156
+ "rewards/consensus_reward_func/mean": 0.0,
157
+ "rewards/consensus_reward_func/std": 0.0,
158
  "rewards/cumulative_reward_2/mean": 0.0,
159
  "rewards/cumulative_reward_2/std": 0.0,
160
  "rewards/final_correctness_reward_func/mean": 0.0,
161
  "rewards/final_correctness_reward_func/std": 0.0,
162
+ "rewards/question_recreation_reward_func/mean": 0.11714470013976097,
163
+ "rewards/question_recreation_reward_func/std": 0.0,
164
  "rewards/soft_format_reward_func/mean": 0.0,
165
  "rewards/soft_format_reward_func/std": 0.0,
166
  "rewards/strict_format_reward_func/mean": 0.0,
 
182
  "completions/mean_terminated_length": 4.0,
183
  "completions/min_length": 4.0,
184
  "completions/min_terminated_length": 4.0,
185
+ "epoch": 5.0,
186
  "frac_reward_zero_std": 1.0,
187
+ "grad_norm": 0.0,
188
+ "kl": 0.0,
189
  "learning_rate": 6.227427435703995e-07,
190
+ "loss": 0.0,
191
+ "num_tokens": 10400.0,
192
+ "reward": 0.6780470022931695,
193
  "reward_std": 0.0,
194
+ "rewards/concensus_correctness_reward_func/mean": 0.6675000190734863,
195
+ "rewards/concensus_correctness_reward_func/std": 0.7707626819610596,
196
+ "rewards/consensus_reward_func/mean": 0.0,
197
+ "rewards/consensus_reward_func/std": 0.0,
198
  "rewards/cumulative_reward_2/mean": 0.0,
199
  "rewards/cumulative_reward_2/std": 0.0,
200
  "rewards/final_correctness_reward_func/mean": 0.0,
201
  "rewards/final_correctness_reward_func/std": 0.0,
202
+ "rewards/question_recreation_reward_func/mean": 0.010547009063884616,
203
+ "rewards/question_recreation_reward_func/std": 0.002309401286765933,
204
  "rewards/soft_format_reward_func/mean": 0.0,
205
  "rewards/soft_format_reward_func/std": 0.0,
206
  "rewards/strict_format_reward_func/mean": 0.0,
 
222
  "completions/mean_terminated_length": 4.0,
223
  "completions/min_length": 4.0,
224
  "completions/min_terminated_length": 4.0,
225
+ "epoch": 6.0,
226
  "frac_reward_zero_std": 1.0,
227
+ "grad_norm": 0.0,
228
+ "kl": 0.0,
229
  "learning_rate": 4.5871032726383385e-07,
230
  "loss": 0.0,
231
+ "num_tokens": 12480.0,
232
+ "reward": 3.709144711494446,
233
  "reward_std": 0.0,
234
+ "rewards/concensus_correctness_reward_func/mean": 3.5920000076293945,
235
+ "rewards/concensus_correctness_reward_func/std": 0.0011546856258064508,
236
+ "rewards/consensus_reward_func/mean": 0.0,
237
+ "rewards/consensus_reward_func/std": 0.0,
238
  "rewards/cumulative_reward_2/mean": 0.0,
239
  "rewards/cumulative_reward_2/std": 0.0,
240
  "rewards/final_correctness_reward_func/mean": 0.0,
241
  "rewards/final_correctness_reward_func/std": 0.0,
242
+ "rewards/question_recreation_reward_func/mean": 0.11714470013976097,
243
  "rewards/question_recreation_reward_func/std": 0.0,
244
  "rewards/soft_format_reward_func/mean": 0.0,
245
  "rewards/soft_format_reward_func/std": 0.0,
 
262
  "completions/mean_terminated_length": 4.0,
263
  "completions/min_length": 4.0,
264
  "completions/min_terminated_length": 4.0,
265
+ "epoch": 7.0,
266
  "frac_reward_zero_std": 1.0,
267
+ "grad_norm": 0.0,
268
+ "kl": 0.0,
269
  "learning_rate": 2.9915228767351535e-07,
270
  "loss": 0.0,
271
+ "num_tokens": 14560.0,
272
+ "reward": 0.889467597939074,
273
  "reward_std": 0.0,
274
+ "rewards/concensus_correctness_reward_func/mean": 0.815500020980835,
275
+ "rewards/concensus_correctness_reward_func/std": 0.9416583180427551,
276
  "rewards/consensus_reward_func/mean": 0.0,
277
  "rewards/consensus_reward_func/std": 0.0,
278
  "rewards/cumulative_reward_2/mean": 0.0,
279
  "rewards/cumulative_reward_2/std": 0.0,
280
  "rewards/final_correctness_reward_func/mean": 0.0,
281
  "rewards/final_correctness_reward_func/std": 0.0,
282
+ "rewards/question_recreation_reward_func/mean": 0.07396756950765848,
283
  "rewards/question_recreation_reward_func/std": 0.0,
284
  "rewards/soft_format_reward_func/mean": 0.0,
285
  "rewards/soft_format_reward_func/std": 0.0,
 
302
  "completions/mean_terminated_length": 4.0,
303
  "completions/min_length": 4.0,
304
  "completions/min_terminated_length": 4.0,
305
+ "epoch": 8.0,
306
  "frac_reward_zero_std": 1.0,
307
+ "grad_norm": 0.0,
308
+ "kl": 0.0,
309
  "learning_rate": 1.6135921418712955e-07,
310
  "loss": 0.0,
311
+ "num_tokens": 16640.0,
312
+ "reward": 1.7049675593152642,
313
  "reward_std": 0.0,
314
+ "rewards/concensus_correctness_reward_func/mean": 1.63100004196167,
315
+ "rewards/concensus_correctness_reward_func/std": 0.0,
316
+ "rewards/consensus_reward_func/mean": 0.0,
317
+ "rewards/consensus_reward_func/std": 0.0,
318
  "rewards/cumulative_reward_2/mean": 0.0,
319
  "rewards/cumulative_reward_2/std": 0.0,
320
  "rewards/final_correctness_reward_func/mean": 0.0,
321
  "rewards/final_correctness_reward_func/std": 0.0,
322
+ "rewards/question_recreation_reward_func/mean": 0.07396756950765848,
323
+ "rewards/question_recreation_reward_func/std": 0.0,
324
  "rewards/soft_format_reward_func/mean": 0.0,
325
  "rewards/soft_format_reward_func/std": 0.0,
326
  "rewards/strict_format_reward_func/mean": 0.0,
 
342
  "completions/mean_terminated_length": 4.0,
343
  "completions/min_length": 4.0,
344
  "completions/min_terminated_length": 4.0,
345
+ "epoch": 9.0,
346
  "frac_reward_zero_std": 1.0,
347
+ "grad_norm": 0.0,
348
+ "kl": 0.0,
349
  "learning_rate": 6.026312439675551e-08,
350
  "loss": 0.0,
351
+ "num_tokens": 18720.0,
352
+ "reward": 0.889467597939074,
353
  "reward_std": 0.0,
354
+ "rewards/concensus_correctness_reward_func/mean": 0.815500020980835,
355
+ "rewards/concensus_correctness_reward_func/std": 0.9416583180427551,
356
+ "rewards/consensus_reward_func/mean": 0.0,
357
+ "rewards/consensus_reward_func/std": 0.0,
358
  "rewards/cumulative_reward_2/mean": 0.0,
359
  "rewards/cumulative_reward_2/std": 0.0,
360
  "rewards/final_correctness_reward_func/mean": 0.0,
361
  "rewards/final_correctness_reward_func/std": 0.0,
362
+ "rewards/question_recreation_reward_func/mean": 0.07396756950765848,
363
+ "rewards/question_recreation_reward_func/std": 0.0,
364
  "rewards/soft_format_reward_func/mean": 0.0,
365
  "rewards/soft_format_reward_func/std": 0.0,
366
  "rewards/strict_format_reward_func/mean": 0.0,
 
382
  "completions/mean_terminated_length": 4.0,
383
  "completions/min_length": 4.0,
384
  "completions/min_terminated_length": 4.0,
385
+ "epoch": 10.0,
386
  "frac_reward_zero_std": 1.0,
387
+ "grad_norm": 0.0,
388
+ "kl": 0.0,
389
  "learning_rate": 6.819348298638839e-09,
390
  "loss": 0.0,
391
+ "num_tokens": 20800.0,
392
+ "reward": 2.021271170116961,
393
  "reward_std": 0.0,
394
+ "rewards/concensus_correctness_reward_func/mean": 1.9609999656677246,
395
+ "rewards/concensus_correctness_reward_func/std": 0.0011546856258064508,
396
+ "rewards/consensus_reward_func/mean": 0.0,
397
+ "rewards/consensus_reward_func/std": 0.0,
398
  "rewards/cumulative_reward_2/mean": 0.0,
399
  "rewards/cumulative_reward_2/std": 0.0,
400
  "rewards/final_correctness_reward_func/mean": 0.0,
401
  "rewards/final_correctness_reward_func/std": 0.0,
402
+ "rewards/question_recreation_reward_func/mean": 0.060271148569881916,
403
  "rewards/question_recreation_reward_func/std": 0.0,
404
  "rewards/soft_format_reward_func/mean": 0.0,
405
  "rewards/soft_format_reward_func/std": 0.0,
 
410
  "step": 20
411
  },
412
  {
413
+ "epoch": 10.0,
414
  "step": 20,
415
  "total_flos": 0.0,
416
+ "train_loss": 0.0,
417
+ "train_runtime": 1185.7286,
418
+ "train_samples_per_second": 0.067,
419
+ "train_steps_per_second": 0.017
420
  }
421
  ],
422
  "logging_steps": 2,
423
  "max_steps": 20,
424
+ "num_input_tokens_seen": 20800,
425
+ "num_train_epochs": 10,
426
  "save_steps": 25,
427
  "stateful_callbacks": {
428
  "TrainerControl": {