kingproabc commited on
Commit
5a24d28
·
verified ·
1 Parent(s): 9c20bf1

End of training

Browse files
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.00012401715792293545,
4
- "train_runtime": 473.4063,
5
- "train_samples": 800,
6
- "train_samples_per_second": 3.38,
7
- "train_steps_per_second": 0.211
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 4.617630838765763e-06,
4
+ "train_runtime": 463.4946,
5
+ "train_samples": 599,
6
+ "train_samples_per_second": 0.69,
7
+ "train_steps_per_second": 0.043
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d75109916f9a05e704226ecdb907c05685845a188bffa960f80a1af9c6ef7400
3
  size 1976163472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a57fd08d5b5d0ca9650c6d1a6a1f2bdab037daed303c9a4ec056d64690a2823
3
  size 1976163472
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.00012401715792293545,
4
- "train_runtime": 473.4063,
5
- "train_samples": 800,
6
- "train_samples_per_second": 3.38,
7
- "train_steps_per_second": 0.211
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 4.617630838765763e-06,
4
+ "train_runtime": 463.4946,
5
+ "train_samples": 599,
6
+ "train_samples_per_second": 0.69,
7
+ "train_steps_per_second": 0.043
8
  }
trainer_state.json CHANGED
@@ -2,975 +2,215 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.0,
6
  "eval_steps": 500,
7
- "global_step": 100,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "completion_length": 227.09375,
14
- "epoch": 0.02,
15
- "grad_norm": 2.520681619644165,
16
  "kl": 0.0,
17
- "learning_rate": 8e-07,
18
  "loss": 0.0,
19
- "reward": 4.047884318977594,
20
- "reward_std": 0.46945752715691924,
21
- "rewards/concensus_correctness_reward_func": 0.9866875074803829,
22
- "rewards/consensus_reward_func": 1.0625,
23
  "rewards/cumulative_reward_2": 0.0,
24
- "rewards/final_correctness_reward_func": 0.125,
25
- "rewards/question_recreation_reward_func": 0.7446030541323125,
26
  "rewards/soft_format_reward_func": 0.0,
27
- "rewards/strict_format_reward_func": 0.25,
28
- "rewards/xmlcount_reward_func": 0.879093749448657,
29
  "step": 2
30
  },
31
  {
32
- "completion_length": 125.96875,
33
- "epoch": 0.04,
34
- "grad_norm": 0.0030505801551043987,
35
- "kl": 6.544499410665594e-05,
36
- "learning_rate": 1.199685341427309e-06,
37
  "loss": 0.0,
38
- "reward": 6.914750009775162,
39
- "reward_std": 0.0,
40
- "rewards/concensus_correctness_reward_func": 2.039750002324581,
41
- "rewards/consensus_reward_func": 2.0,
42
  "rewards/cumulative_reward_2": 0.0,
43
- "rewards/final_correctness_reward_func": 0.125,
44
- "rewards/question_recreation_reward_func": 1.0,
45
  "rewards/soft_format_reward_func": 0.0,
46
- "rewards/strict_format_reward_func": 0.5,
47
- "rewards/xmlcount_reward_func": 1.25,
48
  "step": 4
49
  },
50
  {
51
- "completion_length": 129.625,
52
- "epoch": 0.06,
53
- "grad_norm": 0.09264357388019562,
54
- "kl": 0.0016755392222194132,
55
- "learning_rate": 1.1971700526999683e-06,
56
  "loss": 0.0,
57
- "reward": 6.944124937057495,
58
- "reward_std": 0.022097086533904076,
59
- "rewards/concensus_correctness_reward_func": 1.9597500190138817,
60
- "rewards/consensus_reward_func": 2.0,
61
  "rewards/cumulative_reward_2": 0.0,
62
- "rewards/final_correctness_reward_func": 0.25,
63
- "rewards/question_recreation_reward_func": 1.0,
64
  "rewards/soft_format_reward_func": 0.0,
65
- "rewards/strict_format_reward_func": 0.484375,
66
- "rewards/xmlcount_reward_func": 1.25,
67
  "step": 6
68
  },
69
  {
70
- "completion_length": 112.6875,
71
- "epoch": 0.08,
72
- "grad_norm": 0.3436090648174286,
73
- "kl": 0.0025850895719941036,
74
- "learning_rate": 1.1921500252403342e-06,
75
  "loss": 0.0,
76
- "reward": 6.916000008583069,
77
- "reward_std": 0.0,
78
- "rewards/concensus_correctness_reward_func": 2.041000008583069,
79
- "rewards/consensus_reward_func": 2.0,
80
  "rewards/cumulative_reward_2": 0.0,
81
  "rewards/final_correctness_reward_func": 0.125,
82
- "rewards/question_recreation_reward_func": 1.0,
83
  "rewards/soft_format_reward_func": 0.0,
84
- "rewards/strict_format_reward_func": 0.5,
85
- "rewards/xmlcount_reward_func": 1.25,
86
  "step": 8
87
  },
88
  {
89
- "completion_length": 118.15625,
90
- "epoch": 0.1,
91
- "grad_norm": 0.03444274142384529,
92
- "kl": 0.0006890022309562482,
93
- "learning_rate": 1.1846463147881556e-06,
94
  "loss": 0.0,
95
- "reward": 7.161250025033951,
96
- "reward_std": 0.0,
97
- "rewards/concensus_correctness_reward_func": 2.161250025033951,
98
- "rewards/consensus_reward_func": 2.0,
99
  "rewards/cumulative_reward_2": 0.0,
100
- "rewards/final_correctness_reward_func": 0.25,
101
- "rewards/question_recreation_reward_func": 1.0,
102
  "rewards/soft_format_reward_func": 0.0,
103
- "rewards/strict_format_reward_func": 0.5,
104
- "rewards/xmlcount_reward_func": 1.25,
105
  "step": 10
106
  },
107
  {
108
- "completion_length": 110.40625,
109
- "epoch": 0.12,
110
- "grad_norm": 0.47671496868133545,
111
- "kl": 0.01094875360485048,
112
- "learning_rate": 1.174690394512792e-06,
113
- "loss": 0.0002,
114
- "reward": 6.730374991893768,
115
- "reward_std": 0.0,
116
- "rewards/concensus_correctness_reward_func": 1.8553750216960907,
117
- "rewards/consensus_reward_func": 2.0,
118
  "rewards/cumulative_reward_2": 0.0,
119
- "rewards/final_correctness_reward_func": 0.125,
120
- "rewards/question_recreation_reward_func": 1.0,
121
  "rewards/soft_format_reward_func": 0.0,
122
- "rewards/strict_format_reward_func": 0.5,
123
- "rewards/xmlcount_reward_func": 1.25,
124
  "step": 12
125
  },
126
  {
127
- "completion_length": 119.8125,
128
- "epoch": 0.14,
129
- "grad_norm": 0.002810833742842078,
130
- "kl": 0.0007294444491208196,
131
- "learning_rate": 1.1623240230038062e-06,
132
  "loss": 0.0,
133
- "reward": 6.912124991416931,
134
- "reward_std": 0.0,
135
- "rewards/concensus_correctness_reward_func": 2.0371250063180923,
136
- "rewards/consensus_reward_func": 2.0,
137
  "rewards/cumulative_reward_2": 0.0,
138
- "rewards/final_correctness_reward_func": 0.125,
139
- "rewards/question_recreation_reward_func": 1.0,
140
  "rewards/soft_format_reward_func": 0.0,
141
- "rewards/strict_format_reward_func": 0.5,
142
- "rewards/xmlcount_reward_func": 1.25,
143
  "step": 14
144
  },
145
  {
146
- "completion_length": 110.34375,
147
- "epoch": 0.16,
148
- "grad_norm": 0.006699176039546728,
149
- "kl": 0.0008764536623857566,
150
- "learning_rate": 1.147599069120924e-06,
151
  "loss": 0.0,
152
- "reward": 7.165000021457672,
153
- "reward_std": 0.0,
154
- "rewards/concensus_correctness_reward_func": 2.165000021457672,
155
- "rewards/consensus_reward_func": 2.0,
156
- "rewards/cumulative_reward_2": 0.0,
157
- "rewards/final_correctness_reward_func": 0.25,
158
- "rewards/question_recreation_reward_func": 1.0,
159
- "rewards/soft_format_reward_func": 0.0,
160
- "rewards/strict_format_reward_func": 0.5,
161
- "rewards/xmlcount_reward_func": 1.25,
162
- "step": 16
163
- },
164
- {
165
- "completion_length": 107.6875,
166
- "epoch": 0.18,
167
- "grad_norm": 0.608063280582428,
168
- "kl": 0.012466257388950908,
169
- "learning_rate": 1.1305772944380141e-06,
170
- "loss": 0.0002,
171
- "reward": 7.613874942064285,
172
- "reward_std": 0.022097086533904076,
173
- "rewards/concensus_correctness_reward_func": 2.3795000091195107,
174
- "rewards/consensus_reward_func": 2.0,
175
- "rewards/cumulative_reward_2": 0.0,
176
- "rewards/final_correctness_reward_func": 0.5,
177
- "rewards/question_recreation_reward_func": 1.0,
178
- "rewards/soft_format_reward_func": 0.0,
179
- "rewards/strict_format_reward_func": 0.484375,
180
- "rewards/xmlcount_reward_func": 1.25,
181
- "step": 18
182
- },
183
- {
184
- "completion_length": 113.71875,
185
- "epoch": 0.2,
186
- "grad_norm": 0.03589734435081482,
187
- "kl": 0.002400611226789806,
188
- "learning_rate": 1.1113300941935815e-06,
189
- "loss": 0.0,
190
- "reward": 6.913101881742477,
191
- "reward_std": 0.0002745024103205651,
192
- "rewards/concensus_correctness_reward_func": 2.038499988615513,
193
- "rewards/consensus_reward_func": 2.0,
194
  "rewards/cumulative_reward_2": 0.0,
195
  "rewards/final_correctness_reward_func": 0.125,
196
- "rewards/question_recreation_reward_func": 0.9996018745005131,
197
  "rewards/soft_format_reward_func": 0.0,
198
- "rewards/strict_format_reward_func": 0.5,
199
- "rewards/xmlcount_reward_func": 1.25,
200
- "step": 20
201
- },
202
- {
203
- "completion_length": 128.1875,
204
- "epoch": 0.22,
205
- "grad_norm": 0.017112350091338158,
206
- "kl": 0.0014598190628021257,
207
- "learning_rate": 1.0899381978343265e-06,
208
- "loss": 0.0,
209
- "reward": 6.634874999523163,
210
- "reward_std": 0.0,
211
- "rewards/concensus_correctness_reward_func": 1.884875014424324,
212
- "rewards/consensus_reward_func": 2.0,
213
- "rewards/cumulative_reward_2": 0.0,
214
- "rewards/final_correctness_reward_func": 0.0,
215
- "rewards/question_recreation_reward_func": 1.0,
216
- "rewards/soft_format_reward_func": 0.0,
217
- "rewards/strict_format_reward_func": 0.5,
218
- "rewards/xmlcount_reward_func": 1.25,
219
- "step": 22
220
- },
221
- {
222
- "completion_length": 112.6875,
223
- "epoch": 0.24,
224
- "grad_norm": 0.048251923173666,
225
- "kl": 0.0009265093149224413,
226
- "learning_rate": 1.0664913304077896e-06,
227
- "loss": 0.0,
228
- "reward": 6.918624967336655,
229
- "reward_std": 0.0,
230
- "rewards/concensus_correctness_reward_func": 2.0436250120401382,
231
- "rewards/consensus_reward_func": 2.0,
232
- "rewards/cumulative_reward_2": 0.0,
233
- "rewards/final_correctness_reward_func": 0.125,
234
- "rewards/question_recreation_reward_func": 1.0,
235
- "rewards/soft_format_reward_func": 0.0,
236
- "rewards/strict_format_reward_func": 0.5,
237
- "rewards/xmlcount_reward_func": 1.25,
238
- "step": 24
239
  },
240
  {
241
- "completion_length": 127.8125,
242
- "epoch": 0.26,
243
- "grad_norm": 0.6262523531913757,
244
- "kl": 0.0013364659944272717,
245
- "learning_rate": 1.0410878362243202e-06,
246
  "loss": 0.0,
247
- "reward": 6.8369995057582855,
248
- "reward_std": 0.08856584758905228,
249
- "rewards/concensus_correctness_reward_func": 2.024624988436699,
250
- "rewards/consensus_reward_func": 2.0,
251
  "rewards/cumulative_reward_2": 0.0,
252
  "rewards/final_correctness_reward_func": 0.0625,
253
- "rewards/question_recreation_reward_func": 0.9998744986951351,
254
- "rewards/soft_format_reward_func": 0.0,
255
- "rewards/strict_format_reward_func": 0.5,
256
- "rewards/xmlcount_reward_func": 1.25,
257
- "step": 26
258
- },
259
- {
260
- "completion_length": 105.0625,
261
- "epoch": 0.28,
262
- "grad_norm": 0.02676808461546898,
263
- "kl": 0.0018403055837552529,
264
- "learning_rate": 1.0138342663668626e-06,
265
- "loss": 0.0,
266
- "reward": 6.66587495803833,
267
- "reward_std": 0.0,
268
- "rewards/concensus_correctness_reward_func": 1.9158750250935555,
269
- "rewards/consensus_reward_func": 2.0,
270
- "rewards/cumulative_reward_2": 0.0,
271
- "rewards/final_correctness_reward_func": 0.0,
272
- "rewards/question_recreation_reward_func": 1.0,
273
- "rewards/soft_format_reward_func": 0.0,
274
- "rewards/strict_format_reward_func": 0.5,
275
- "rewards/xmlcount_reward_func": 1.25,
276
- "step": 28
277
- },
278
- {
279
- "completion_length": 113.34375,
280
- "epoch": 0.3,
281
- "grad_norm": 0.02248120680451393,
282
- "kl": 0.001570980580709147,
283
- "learning_rate": 9.848449317786839e-07,
284
- "loss": 0.0,
285
- "reward": 6.9081249833106995,
286
- "reward_std": 0.0,
287
- "rewards/concensus_correctness_reward_func": 2.0331250056624413,
288
- "rewards/consensus_reward_func": 2.0,
289
- "rewards/cumulative_reward_2": 0.0,
290
- "rewards/final_correctness_reward_func": 0.125,
291
- "rewards/question_recreation_reward_func": 1.0,
292
- "rewards/soft_format_reward_func": 0.0,
293
- "rewards/strict_format_reward_func": 0.5,
294
- "rewards/xmlcount_reward_func": 1.25,
295
- "step": 30
296
- },
297
- {
298
- "completion_length": 111.875,
299
- "epoch": 0.32,
300
- "grad_norm": 0.06672874093055725,
301
- "kl": 0.002423093282232003,
302
- "learning_rate": 9.542414238035536e-07,
303
- "loss": 0.0,
304
- "reward": 7.160874992609024,
305
- "reward_std": 0.0,
306
- "rewards/concensus_correctness_reward_func": 2.1608749851584435,
307
- "rewards/consensus_reward_func": 2.0,
308
- "rewards/cumulative_reward_2": 0.0,
309
- "rewards/final_correctness_reward_func": 0.25,
310
- "rewards/question_recreation_reward_func": 1.0,
311
  "rewards/soft_format_reward_func": 0.0,
312
- "rewards/strict_format_reward_func": 0.5,
313
- "rewards/xmlcount_reward_func": 1.25,
314
- "step": 32
315
- },
316
- {
317
- "completion_length": 124.71875,
318
- "epoch": 0.34,
319
- "grad_norm": 0.15347696840763092,
320
- "kl": 0.0032288330085066264,
321
- "learning_rate": 9.221521041893904e-07,
322
- "loss": 0.0,
323
- "reward": 7.154421359300613,
324
- "reward_std": 0.004177047871053219,
325
- "rewards/concensus_correctness_reward_func": 2.157375007867813,
326
- "rewards/consensus_reward_func": 2.0,
327
- "rewards/cumulative_reward_2": 0.0,
328
- "rewards/final_correctness_reward_func": 0.25,
329
- "rewards/question_recreation_reward_func": 0.9970463812351227,
330
- "rewards/soft_format_reward_func": 0.0,
331
- "rewards/strict_format_reward_func": 0.5,
332
- "rewards/xmlcount_reward_func": 1.25,
333
- "step": 34
334
- },
335
- {
336
- "completion_length": 124.125,
337
- "epoch": 0.36,
338
- "grad_norm": 0.20969295501708984,
339
- "kl": 0.006022399147695978,
340
- "learning_rate": 8.88711566694483e-07,
341
- "loss": 0.0001,
342
- "reward": 7.163678020238876,
343
- "reward_std": 0.00010184785787714645,
344
- "rewards/concensus_correctness_reward_func": 2.163750007748604,
345
- "rewards/consensus_reward_func": 2.0,
346
- "rewards/cumulative_reward_2": 0.0,
347
- "rewards/final_correctness_reward_func": 0.25,
348
- "rewards/question_recreation_reward_func": 0.9999279975891113,
349
- "rewards/soft_format_reward_func": 0.0,
350
- "rewards/strict_format_reward_func": 0.5,
351
- "rewards/xmlcount_reward_func": 1.25,
352
- "step": 36
353
- },
354
- {
355
- "completion_length": 113.40625,
356
- "epoch": 0.38,
357
- "grad_norm": 0.016258783638477325,
358
- "kl": 0.0014696058724439354,
359
- "learning_rate": 8.540600725544942e-07,
360
- "loss": 0.0,
361
- "reward": 6.662374943494797,
362
- "reward_std": 0.0,
363
- "rewards/concensus_correctness_reward_func": 1.9123750254511833,
364
- "rewards/consensus_reward_func": 2.0,
365
- "rewards/cumulative_reward_2": 0.0,
366
- "rewards/final_correctness_reward_func": 0.0,
367
- "rewards/question_recreation_reward_func": 1.0,
368
- "rewards/soft_format_reward_func": 0.0,
369
- "rewards/strict_format_reward_func": 0.5,
370
- "rewards/xmlcount_reward_func": 1.25,
371
- "step": 38
372
- },
373
- {
374
- "completion_length": 105.71875,
375
- "epoch": 0.4,
376
- "grad_norm": 0.3598341643810272,
377
- "kl": 0.011058556609668813,
378
- "learning_rate": 8.183429621780925e-07,
379
- "loss": 0.0002,
380
- "reward": 7.289874881505966,
381
- "reward_std": 0.0,
382
- "rewards/concensus_correctness_reward_func": 2.289875015616417,
383
- "rewards/consensus_reward_func": 2.0,
384
- "rewards/cumulative_reward_2": 0.0,
385
- "rewards/final_correctness_reward_func": 0.25,
386
- "rewards/question_recreation_reward_func": 1.0,
387
- "rewards/soft_format_reward_func": 0.0,
388
- "rewards/strict_format_reward_func": 0.5,
389
- "rewards/xmlcount_reward_func": 1.25,
390
- "step": 40
391
- },
392
- {
393
- "completion_length": 105.59375,
394
- "epoch": 0.42,
395
- "grad_norm": 0.01676054112613201,
396
- "kl": 0.001547790251834158,
397
- "learning_rate": 7.817100455387703e-07,
398
- "loss": 0.0,
399
- "reward": 7.168750017881393,
400
- "reward_std": 0.0,
401
- "rewards/concensus_correctness_reward_func": 2.1687499955296516,
402
- "rewards/consensus_reward_func": 2.0,
403
- "rewards/cumulative_reward_2": 0.0,
404
- "rewards/final_correctness_reward_func": 0.25,
405
- "rewards/question_recreation_reward_func": 1.0,
406
- "rewards/soft_format_reward_func": 0.0,
407
- "rewards/strict_format_reward_func": 0.5,
408
- "rewards/xmlcount_reward_func": 1.25,
409
- "step": 42
410
- },
411
- {
412
- "completion_length": 116.09375,
413
- "epoch": 0.44,
414
- "grad_norm": 0.040649134665727615,
415
- "kl": 0.0027483347312227124,
416
- "learning_rate": 7.443149738197521e-07,
417
- "loss": 0.0,
418
- "reward": 7.121432453393936,
419
- "reward_std": 0.008820239454507828,
420
- "rewards/concensus_correctness_reward_func": 2.137750022113323,
421
- "rewards/consensus_reward_func": 2.0,
422
- "rewards/cumulative_reward_2": 0.0,
423
- "rewards/final_correctness_reward_func": 0.25,
424
- "rewards/question_recreation_reward_func": 0.9836825020611286,
425
- "rewards/soft_format_reward_func": 0.0,
426
- "rewards/strict_format_reward_func": 0.5,
427
- "rewards/xmlcount_reward_func": 1.25,
428
- "step": 44
429
- },
430
- {
431
- "completion_length": 114.40625,
432
- "epoch": 0.46,
433
- "grad_norm": 0.01834481954574585,
434
- "kl": 0.011484232741167943,
435
- "learning_rate": 7.063145949475337e-07,
436
- "loss": 0.0002,
437
- "reward": 7.043624967336655,
438
- "reward_std": 0.0,
439
- "rewards/concensus_correctness_reward_func": 2.168625019490719,
440
- "rewards/consensus_reward_func": 1.875,
441
- "rewards/cumulative_reward_2": 0.0,
442
- "rewards/final_correctness_reward_func": 0.25,
443
- "rewards/question_recreation_reward_func": 1.0,
444
- "rewards/soft_format_reward_func": 0.0,
445
- "rewards/strict_format_reward_func": 0.5,
446
- "rewards/xmlcount_reward_func": 1.25,
447
- "step": 46
448
- },
449
- {
450
- "completion_length": 120.0,
451
- "epoch": 0.48,
452
- "grad_norm": 0.04231150075793266,
453
- "kl": 0.017316492816462414,
454
- "learning_rate": 6.678682957171706e-07,
455
- "loss": 0.0003,
456
- "reward": 6.907124996185303,
457
- "reward_std": 0.0,
458
- "rewards/concensus_correctness_reward_func": 2.0321250036358833,
459
- "rewards/consensus_reward_func": 2.0,
460
- "rewards/cumulative_reward_2": 0.0,
461
- "rewards/final_correctness_reward_func": 0.125,
462
- "rewards/question_recreation_reward_func": 1.0,
463
- "rewards/soft_format_reward_func": 0.0,
464
- "rewards/strict_format_reward_func": 0.5,
465
- "rewards/xmlcount_reward_func": 1.25,
466
- "step": 48
467
- },
468
- {
469
- "completion_length": 105.71875,
470
- "epoch": 0.5,
471
- "grad_norm": 0.9996789693832397,
472
- "kl": 0.0018611156865517842,
473
- "learning_rate": 6.29137333268669e-07,
474
- "loss": 0.0,
475
- "reward": 7.403761506080627,
476
- "reward_std": 0.00033736444311216474,
477
- "rewards/concensus_correctness_reward_func": 2.2790000066161156,
478
- "rewards/consensus_reward_func": 2.0,
479
- "rewards/cumulative_reward_2": 0.0,
480
- "rewards/final_correctness_reward_func": 0.375,
481
- "rewards/question_recreation_reward_func": 0.999761451035738,
482
- "rewards/soft_format_reward_func": 0.0,
483
- "rewards/strict_format_reward_func": 0.5,
484
- "rewards/xmlcount_reward_func": 1.25,
485
- "step": 50
486
- },
487
- {
488
- "completion_length": 111.34375,
489
- "epoch": 0.52,
490
- "grad_norm": 0.04733134061098099,
491
- "kl": 0.0036280953381719883,
492
- "learning_rate": 5.902841587184991e-07,
493
- "loss": 0.0001,
494
- "reward": 6.6659999787807465,
495
- "reward_std": 0.0,
496
- "rewards/concensus_correctness_reward_func": 1.9160000085830688,
497
- "rewards/consensus_reward_func": 2.0,
498
- "rewards/cumulative_reward_2": 0.0,
499
- "rewards/final_correctness_reward_func": 0.0,
500
- "rewards/question_recreation_reward_func": 1.0,
501
- "rewards/soft_format_reward_func": 0.0,
502
- "rewards/strict_format_reward_func": 0.5,
503
- "rewards/xmlcount_reward_func": 1.25,
504
- "step": 52
505
- },
506
- {
507
- "completion_length": 115.375,
508
- "epoch": 0.54,
509
- "grad_norm": 0.0800234004855156,
510
- "kl": 0.014154426544337184,
511
- "learning_rate": 5.514717357831576e-07,
512
- "loss": 0.0002,
513
- "reward": 7.356687515974045,
514
- "reward_std": 0.26171789318323135,
515
- "rewards/concensus_correctness_reward_func": 2.294187508523464,
516
- "rewards/consensus_reward_func": 1.875,
517
- "rewards/cumulative_reward_2": 0.0,
518
- "rewards/final_correctness_reward_func": 0.4375,
519
- "rewards/question_recreation_reward_func": 1.0,
520
- "rewards/soft_format_reward_func": 0.0,
521
- "rewards/strict_format_reward_func": 0.5,
522
- "rewards/xmlcount_reward_func": 1.25,
523
- "step": 54
524
- },
525
- {
526
- "completion_length": 116.65625,
527
- "epoch": 0.56,
528
- "grad_norm": 0.05929900333285332,
529
- "kl": 0.010289387573720887,
530
- "learning_rate": 5.12862857252706e-07,
531
- "loss": 0.0002,
532
- "reward": 7.364749997854233,
533
- "reward_std": 0.0,
534
- "rewards/concensus_correctness_reward_func": 2.239749997854233,
535
- "rewards/consensus_reward_func": 2.0,
536
- "rewards/cumulative_reward_2": 0.0,
537
- "rewards/final_correctness_reward_func": 0.375,
538
- "rewards/question_recreation_reward_func": 1.0,
539
- "rewards/soft_format_reward_func": 0.0,
540
- "rewards/strict_format_reward_func": 0.5,
541
- "rewards/xmlcount_reward_func": 1.25,
542
- "step": 56
543
- },
544
- {
545
- "completion_length": 113.65625,
546
- "epoch": 0.58,
547
- "grad_norm": 0.08536501228809357,
548
- "kl": 0.0039269381459234864,
549
- "learning_rate": 4.7461946218123493e-07,
550
- "loss": 0.0001,
551
- "reward": 6.906499952077866,
552
- "reward_std": 0.0,
553
- "rewards/concensus_correctness_reward_func": 2.031499996781349,
554
- "rewards/consensus_reward_func": 2.0,
555
- "rewards/cumulative_reward_2": 0.0,
556
- "rewards/final_correctness_reward_func": 0.125,
557
- "rewards/question_recreation_reward_func": 1.0,
558
- "rewards/soft_format_reward_func": 0.0,
559
- "rewards/strict_format_reward_func": 0.5,
560
- "rewards/xmlcount_reward_func": 1.25,
561
- "step": 58
562
- },
563
- {
564
- "completion_length": 112.6875,
565
- "epoch": 0.6,
566
- "grad_norm": 0.0589258149266243,
567
- "kl": 0.005026609829656081,
568
- "learning_rate": 4.369019566581942e-07,
569
- "loss": 0.0001,
570
- "reward": 7.163875013589859,
571
- "reward_std": 0.0,
572
- "rewards/concensus_correctness_reward_func": 2.163875013589859,
573
- "rewards/consensus_reward_func": 2.0,
574
- "rewards/cumulative_reward_2": 0.0,
575
- "rewards/final_correctness_reward_func": 0.25,
576
- "rewards/question_recreation_reward_func": 1.0,
577
- "rewards/soft_format_reward_func": 0.0,
578
- "rewards/strict_format_reward_func": 0.5,
579
- "rewards/xmlcount_reward_func": 1.25,
580
- "step": 60
581
- },
582
- {
583
- "completion_length": 115.5,
584
- "epoch": 0.62,
585
- "grad_norm": 1.4010002613067627,
586
- "kl": 0.01616358994942857,
587
- "learning_rate": 3.9986854100950773e-07,
588
- "loss": 0.0002,
589
- "reward": 6.842124968767166,
590
- "reward_std": 0.0,
591
- "rewards/concensus_correctness_reward_func": 1.9671249985694885,
592
- "rewards/consensus_reward_func": 2.0,
593
- "rewards/cumulative_reward_2": 0.0,
594
- "rewards/final_correctness_reward_func": 0.125,
595
- "rewards/question_recreation_reward_func": 1.0,
596
- "rewards/soft_format_reward_func": 0.0,
597
- "rewards/strict_format_reward_func": 0.5,
598
- "rewards/xmlcount_reward_func": 1.25,
599
- "step": 62
600
- },
601
- {
602
- "completion_length": 110.21875,
603
- "epoch": 0.64,
604
- "grad_norm": 0.04362049698829651,
605
- "kl": 0.009225194641658163,
606
- "learning_rate": 3.6367454625042783e-07,
607
- "loss": 0.0001,
608
- "reward": 7.585927993059158,
609
- "reward_std": 0.0,
610
- "rewards/concensus_correctness_reward_func": 2.3382499888539314,
611
- "rewards/consensus_reward_func": 2.0,
612
- "rewards/cumulative_reward_2": 0.0,
613
- "rewards/final_correctness_reward_func": 0.5,
614
- "rewards/question_recreation_reward_func": 0.9976780191063881,
615
- "rewards/soft_format_reward_func": 0.0,
616
- "rewards/strict_format_reward_func": 0.5,
617
- "rewards/xmlcount_reward_func": 1.25,
618
- "step": 64
619
- },
620
- {
621
- "completion_length": 117.0625,
622
- "epoch": 0.66,
623
- "grad_norm": 0.04796240106225014,
624
- "kl": 0.0031241076030710246,
625
- "learning_rate": 3.284717825732767e-07,
626
- "loss": 0.0,
627
- "reward": 6.894343733787537,
628
- "reward_std": 0.027621358633041382,
629
- "rewards/concensus_correctness_reward_func": 2.038875013589859,
630
- "rewards/consensus_reward_func": 2.0,
631
- "rewards/cumulative_reward_2": 0.0,
632
- "rewards/final_correctness_reward_func": 0.125,
633
- "rewards/question_recreation_reward_func": 1.0,
634
- "rewards/soft_format_reward_func": 0.0,
635
- "rewards/strict_format_reward_func": 0.484375,
636
- "rewards/xmlcount_reward_func": 1.24609375,
637
- "step": 66
638
- },
639
- {
640
- "completion_length": 120.53125,
641
- "epoch": 0.68,
642
- "grad_norm": 0.6480430960655212,
643
- "kl": 0.06253671760896395,
644
- "learning_rate": 2.944079026027424e-07,
645
- "loss": 0.0009,
646
- "reward": 7.561367034912109,
647
- "reward_std": 0.08847127005719813,
648
- "rewards/concensus_correctness_reward_func": 2.374874994158745,
649
- "rewards/consensus_reward_func": 2.0,
650
- "rewards/cumulative_reward_2": 0.0,
651
- "rewards/final_correctness_reward_func": 0.4375,
652
- "rewards/question_recreation_reward_func": 0.9989920035004616,
653
- "rewards/soft_format_reward_func": 0.0,
654
- "rewards/strict_format_reward_func": 0.5,
655
- "rewards/xmlcount_reward_func": 1.25,
656
- "step": 68
657
- },
658
- {
659
- "completion_length": 109.6875,
660
- "epoch": 0.7,
661
- "grad_norm": 0.05288118124008179,
662
- "kl": 0.10043014965231123,
663
- "learning_rate": 2.6162578208945604e-07,
664
- "loss": 0.0015,
665
- "reward": 7.849874943494797,
666
- "reward_std": 0.0,
667
- "rewards/concensus_correctness_reward_func": 2.474875002168119,
668
- "rewards/consensus_reward_func": 1.875,
669
- "rewards/cumulative_reward_2": 0.0,
670
- "rewards/final_correctness_reward_func": 0.75,
671
- "rewards/question_recreation_reward_func": 1.0,
672
- "rewards/soft_format_reward_func": 0.0,
673
- "rewards/strict_format_reward_func": 0.5,
674
- "rewards/xmlcount_reward_func": 1.25,
675
- "step": 70
676
- },
677
- {
678
- "completion_length": 110.3125,
679
- "epoch": 0.72,
680
- "grad_norm": 0.016276629641652107,
681
- "kl": 0.0029021119621575053,
682
- "learning_rate": 2.3026292063944357e-07,
683
- "loss": 0.0,
684
- "reward": 7.09412494301796,
685
- "reward_std": 0.0883883461356163,
686
- "rewards/concensus_correctness_reward_func": 2.156625010073185,
687
- "rewards/consensus_reward_func": 2.0,
688
- "rewards/cumulative_reward_2": 0.0,
689
- "rewards/final_correctness_reward_func": 0.1875,
690
- "rewards/question_recreation_reward_func": 1.0,
691
- "rewards/soft_format_reward_func": 0.0,
692
- "rewards/strict_format_reward_func": 0.5,
693
- "rewards/xmlcount_reward_func": 1.25,
694
- "step": 72
695
- },
696
- {
697
- "completion_length": 116.0625,
698
- "epoch": 0.74,
699
- "grad_norm": 0.0852767825126648,
700
- "kl": 0.005175539316041977,
701
- "learning_rate": 2.0045086499299148e-07,
702
- "loss": 0.0001,
703
- "reward": 7.029082745313644,
704
- "reward_std": 0.0,
705
- "rewards/concensus_correctness_reward_func": 2.1545000076293945,
706
- "rewards/consensus_reward_func": 1.875,
707
- "rewards/cumulative_reward_2": 0.0,
708
- "rewards/final_correctness_reward_func": 0.25,
709
- "rewards/question_recreation_reward_func": 0.9995827786624432,
710
- "rewards/soft_format_reward_func": 0.0,
711
- "rewards/strict_format_reward_func": 0.5,
712
- "rewards/xmlcount_reward_func": 1.25,
713
- "step": 74
714
- },
715
- {
716
- "completion_length": 118.71875,
717
- "epoch": 0.76,
718
- "grad_norm": 0.11004471033811569,
719
- "kl": 0.006573738556653552,
720
- "learning_rate": 1.723146572719026e-07,
721
- "loss": 0.0001,
722
- "reward": 6.78612494468689,
723
- "reward_std": 0.0,
724
- "rewards/concensus_correctness_reward_func": 1.9111249893903732,
725
- "rewards/consensus_reward_func": 2.0,
726
- "rewards/cumulative_reward_2": 0.0,
727
- "rewards/final_correctness_reward_func": 0.125,
728
- "rewards/question_recreation_reward_func": 1.0,
729
- "rewards/soft_format_reward_func": 0.0,
730
- "rewards/strict_format_reward_func": 0.5,
731
- "rewards/xmlcount_reward_func": 1.25,
732
- "step": 76
733
- },
734
- {
735
- "completion_length": 109.6875,
736
- "epoch": 0.78,
737
- "grad_norm": 0.012919344939291477,
738
- "kl": 0.005183771483643795,
739
- "learning_rate": 1.459723105093828e-07,
740
- "loss": 0.0001,
741
- "reward": 6.658572971820831,
742
- "reward_std": 7.354625267907977e-05,
743
- "rewards/concensus_correctness_reward_func": 1.9086250066757202,
744
- "rewards/consensus_reward_func": 2.0,
745
- "rewards/cumulative_reward_2": 0.0,
746
- "rewards/final_correctness_reward_func": 0.0,
747
- "rewards/question_recreation_reward_func": 0.9999480023980141,
748
- "rewards/soft_format_reward_func": 0.0,
749
- "rewards/strict_format_reward_func": 0.5,
750
- "rewards/xmlcount_reward_func": 1.25,
751
- "step": 78
752
- },
753
- {
754
- "completion_length": 112.875,
755
- "epoch": 0.8,
756
- "grad_norm": 0.021166132763028145,
757
- "kl": 0.007018942174909171,
758
- "learning_rate": 1.2153431366236765e-07,
759
- "loss": 0.0001,
760
- "reward": 6.880374938249588,
761
- "reward_std": 0.0,
762
- "rewards/concensus_correctness_reward_func": 2.0053750053048134,
763
- "rewards/consensus_reward_func": 2.0,
764
- "rewards/cumulative_reward_2": 0.0,
765
- "rewards/final_correctness_reward_func": 0.125,
766
- "rewards/question_recreation_reward_func": 1.0,
767
- "rewards/soft_format_reward_func": 0.0,
768
- "rewards/strict_format_reward_func": 0.5,
769
- "rewards/xmlcount_reward_func": 1.25,
770
- "step": 80
771
- },
772
- {
773
- "completion_length": 118.28125,
774
- "epoch": 0.82,
775
- "grad_norm": 0.7134923338890076,
776
- "kl": 0.01929802937729619,
777
- "learning_rate": 9.910316818243741e-08,
778
- "loss": 0.0003,
779
- "reward": 7.154624938964844,
780
- "reward_std": 0.0,
781
- "rewards/concensus_correctness_reward_func": 2.154625006020069,
782
- "rewards/consensus_reward_func": 2.0,
783
- "rewards/cumulative_reward_2": 0.0,
784
- "rewards/final_correctness_reward_func": 0.25,
785
- "rewards/question_recreation_reward_func": 1.0,
786
- "rewards/soft_format_reward_func": 0.0,
787
- "rewards/strict_format_reward_func": 0.5,
788
- "rewards/xmlcount_reward_func": 1.25,
789
- "step": 82
790
- },
791
- {
792
- "completion_length": 115.1875,
793
- "epoch": 0.84,
794
- "grad_norm": 1.3774880170822144,
795
- "kl": 0.005105306326640857,
796
- "learning_rate": 7.877295808910932e-08,
797
- "loss": 0.0001,
798
- "reward": 6.884448528289795,
799
- "reward_std": 0.022340359166264534,
800
- "rewards/concensus_correctness_reward_func": 2.025500014424324,
801
- "rewards/consensus_reward_func": 2.0,
802
- "rewards/cumulative_reward_2": 0.0,
803
- "rewards/final_correctness_reward_func": 0.125,
804
- "rewards/question_recreation_reward_func": 0.9995735697448254,
805
- "rewards/soft_format_reward_func": 0.0,
806
- "rewards/strict_format_reward_func": 0.484375,
807
- "rewards/xmlcount_reward_func": 1.25,
808
- "step": 84
809
- },
810
- {
811
- "completion_length": 113.625,
812
- "epoch": 0.86,
813
- "grad_norm": 0.03088958002626896,
814
- "kl": 0.0026084017076755117,
815
- "learning_rate": 6.062895534876287e-08,
816
- "loss": 0.0,
817
- "reward": 6.6597499549388885,
818
- "reward_std": 0.0,
819
- "rewards/concensus_correctness_reward_func": 1.9097500145435333,
820
- "rewards/consensus_reward_func": 2.0,
821
- "rewards/cumulative_reward_2": 0.0,
822
- "rewards/final_correctness_reward_func": 0.0,
823
- "rewards/question_recreation_reward_func": 1.0,
824
- "rewards/soft_format_reward_func": 0.0,
825
- "rewards/strict_format_reward_func": 0.5,
826
- "rewards/xmlcount_reward_func": 1.25,
827
- "step": 86
828
- },
829
- {
830
- "completion_length": 117.15625,
831
- "epoch": 0.88,
832
- "grad_norm": 0.06499552726745605,
833
- "kl": 0.004322356182001386,
834
- "learning_rate": 4.474726221437752e-08,
835
- "loss": 0.0001,
836
- "reward": 7.145511776208878,
837
- "reward_std": 0.0,
838
- "rewards/concensus_correctness_reward_func": 2.1503750011324883,
839
- "rewards/consensus_reward_func": 2.0,
840
- "rewards/cumulative_reward_2": 0.0,
841
- "rewards/final_correctness_reward_func": 0.25,
842
- "rewards/question_recreation_reward_func": 0.995136771351099,
843
- "rewards/soft_format_reward_func": 0.0,
844
- "rewards/strict_format_reward_func": 0.5,
845
- "rewards/xmlcount_reward_func": 1.25,
846
- "step": 88
847
- },
848
- {
849
- "completion_length": 119.4375,
850
- "epoch": 0.9,
851
- "grad_norm": 0.09833624958992004,
852
- "kl": 0.006649176628343412,
853
- "learning_rate": 3.119449202623774e-08,
854
- "loss": 0.0001,
855
- "reward": 6.901250004768372,
856
- "reward_std": 0.0,
857
- "rewards/concensus_correctness_reward_func": 2.0262500047683716,
858
- "rewards/consensus_reward_func": 2.0,
859
- "rewards/cumulative_reward_2": 0.0,
860
- "rewards/final_correctness_reward_func": 0.125,
861
- "rewards/question_recreation_reward_func": 1.0,
862
- "rewards/soft_format_reward_func": 0.0,
863
- "rewards/strict_format_reward_func": 0.5,
864
- "rewards/xmlcount_reward_func": 1.25,
865
- "step": 90
866
- },
867
- {
868
- "completion_length": 111.125,
869
- "epoch": 0.92,
870
- "grad_norm": 0.029738876968622208,
871
- "kl": 0.003505616401525913,
872
- "learning_rate": 2.0027489812433317e-08,
873
- "loss": 0.0001,
874
- "reward": 6.53612494468689,
875
- "reward_std": 0.0,
876
- "rewards/concensus_correctness_reward_func": 1.9111249893903732,
877
- "rewards/consensus_reward_func": 1.875,
878
- "rewards/cumulative_reward_2": 0.0,
879
- "rewards/final_correctness_reward_func": 0.0,
880
- "rewards/question_recreation_reward_func": 1.0,
881
- "rewards/soft_format_reward_func": 0.0,
882
- "rewards/strict_format_reward_func": 0.5,
883
- "rewards/xmlcount_reward_func": 1.25,
884
- "step": 92
885
- },
886
- {
887
- "completion_length": 123.40625,
888
- "epoch": 0.94,
889
- "grad_norm": 0.026749616488814354,
890
- "kl": 0.0031833851862757,
891
- "learning_rate": 1.1293093861056569e-08,
892
- "loss": 0.0,
893
- "reward": 7.906374961137772,
894
- "reward_std": 0.0,
895
- "rewards/concensus_correctness_reward_func": 2.5313749983906746,
896
- "rewards/consensus_reward_func": 2.0,
897
- "rewards/cumulative_reward_2": 0.0,
898
- "rewards/final_correctness_reward_func": 0.625,
899
- "rewards/question_recreation_reward_func": 1.0,
900
- "rewards/soft_format_reward_func": 0.0,
901
- "rewards/strict_format_reward_func": 0.5,
902
- "rewards/xmlcount_reward_func": 1.25,
903
- "step": 94
904
- },
905
- {
906
- "completion_length": 124.125,
907
- "epoch": 0.96,
908
- "grad_norm": 0.7131932973861694,
909
- "kl": 0.0020799475960302516,
910
- "learning_rate": 5.027939264144954e-09,
911
- "loss": 0.0,
912
- "reward": 7.410148322582245,
913
- "reward_std": 0.0004973328104824759,
914
- "rewards/concensus_correctness_reward_func": 2.285500004887581,
915
- "rewards/consensus_reward_func": 2.0,
916
- "rewards/cumulative_reward_2": 0.0,
917
- "rewards/final_correctness_reward_func": 0.375,
918
- "rewards/question_recreation_reward_func": 0.9996483325958252,
919
- "rewards/soft_format_reward_func": 0.0,
920
- "rewards/strict_format_reward_func": 0.5,
921
- "rewards/xmlcount_reward_func": 1.25,
922
- "step": 96
923
- },
924
- {
925
- "completion_length": 118.875,
926
- "epoch": 0.98,
927
- "grad_norm": 0.04078225791454315,
928
- "kl": 0.008958728666584648,
929
- "learning_rate": 1.2583042573730107e-09,
930
- "loss": 0.0001,
931
- "reward": 7.291375011205673,
932
- "reward_std": 0.0,
933
- "rewards/concensus_correctness_reward_func": 2.2913750037550926,
934
- "rewards/consensus_reward_func": 1.875,
935
- "rewards/cumulative_reward_2": 0.0,
936
- "rewards/final_correctness_reward_func": 0.375,
937
- "rewards/question_recreation_reward_func": 1.0,
938
- "rewards/soft_format_reward_func": 0.0,
939
- "rewards/strict_format_reward_func": 0.5,
940
- "rewards/xmlcount_reward_func": 1.25,
941
- "step": 98
942
  },
943
  {
944
- "completion_length": 104.84375,
945
- "epoch": 1.0,
946
- "grad_norm": 0.013484155759215355,
947
- "kl": 0.002459692420416104,
948
  "learning_rate": 0.0,
949
  "loss": 0.0,
950
- "reward": 7.410000026226044,
951
- "reward_std": 0.0,
952
- "rewards/concensus_correctness_reward_func": 2.285000003874302,
953
- "rewards/consensus_reward_func": 2.0,
954
  "rewards/cumulative_reward_2": 0.0,
955
- "rewards/final_correctness_reward_func": 0.375,
956
- "rewards/question_recreation_reward_func": 1.0,
957
  "rewards/soft_format_reward_func": 0.0,
958
- "rewards/strict_format_reward_func": 0.5,
959
- "rewards/xmlcount_reward_func": 1.25,
960
- "step": 100
961
  },
962
  {
963
- "epoch": 1.0,
964
- "step": 100,
965
  "total_flos": 0.0,
966
- "train_loss": 0.00012401715792293545,
967
- "train_runtime": 473.4063,
968
- "train_samples_per_second": 3.38,
969
- "train_steps_per_second": 0.211
970
  }
971
  ],
972
  "logging_steps": 2,
973
- "max_steps": 100,
974
  "num_input_tokens_seen": 0,
975
  "num_train_epochs": 1,
976
  "save_steps": 25,
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.2671118530884808,
6
  "eval_steps": 500,
7
+ "global_step": 20,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "completion_length": 299.34375,
14
+ "epoch": 0.02671118530884808,
15
+ "grad_norm": 94.53134155273438,
16
  "kl": 0.0,
17
+ "learning_rate": 4.965903258506806e-07,
18
  "loss": 0.0,
19
+ "reward": 0.4095814856700599,
20
+ "reward_std": 0.24939912266563624,
21
+ "rewards/concensus_correctness_reward_func": 0.0,
22
+ "rewards/consensus_reward_func": 0.0625,
23
  "rewards/cumulative_reward_2": 0.0,
24
+ "rewards/final_correctness_reward_func": 0.25,
25
+ "rewards/question_recreation_reward_func": 0.17958148289471865,
26
  "rewards/soft_format_reward_func": 0.0,
27
+ "rewards/strict_format_reward_func": 0.0,
28
+ "rewards/xmlcount_reward_func": -0.08249999745748937,
29
  "step": 2
30
  },
31
  {
32
+ "completion_length": 316.9375,
33
+ "epoch": 0.05342237061769616,
34
+ "grad_norm": 9.352435111999512,
35
+ "kl": 0.0014936136085452745,
36
+ "learning_rate": 4.698684378016222e-07,
37
  "loss": 0.0,
38
+ "reward": 1.0824220394715667,
39
+ "reward_std": 1.2490277905308176,
40
+ "rewards/concensus_correctness_reward_func": 0.625,
41
+ "rewards/consensus_reward_func": 0.0625,
42
  "rewards/cumulative_reward_2": 0.0,
43
+ "rewards/final_correctness_reward_func": 0.0625,
44
+ "rewards/question_recreation_reward_func": 0.1588594950735569,
45
  "rewards/soft_format_reward_func": 0.0,
46
+ "rewards/strict_format_reward_func": 0.0,
47
+ "rewards/xmlcount_reward_func": 0.1735625034198165,
48
  "step": 4
49
  },
50
  {
51
+ "completion_length": 432.5,
52
+ "epoch": 0.08013355592654424,
53
+ "grad_norm": 72.27102661132812,
54
+ "kl": 0.0026759140364447376,
55
+ "learning_rate": 4.193203929064353e-07,
56
  "loss": 0.0,
57
+ "reward": 0.26975517414393835,
58
+ "reward_std": 0.23289441590895876,
59
+ "rewards/concensus_correctness_reward_func": 0.0,
60
+ "rewards/consensus_reward_func": 0.0,
61
  "rewards/cumulative_reward_2": 0.0,
62
+ "rewards/final_correctness_reward_func": 0.0,
63
+ "rewards/question_recreation_reward_func": 0.16150517805363052,
64
  "rewards/soft_format_reward_func": 0.0,
65
+ "rewards/strict_format_reward_func": 0.0,
66
+ "rewards/xmlcount_reward_func": 0.10824999865144491,
67
  "step": 6
68
  },
69
  {
70
+ "completion_length": 424.3125,
71
+ "epoch": 0.10684474123539232,
72
+ "grad_norm": 57.67626190185547,
73
+ "kl": 0.003170746138493996,
74
+ "learning_rate": 3.5042385616324236e-07,
75
  "loss": 0.0,
76
+ "reward": 0.38037113891914487,
77
+ "reward_std": 0.40154265408637,
78
+ "rewards/concensus_correctness_reward_func": 0.0,
79
+ "rewards/consensus_reward_func": 0.0,
80
  "rewards/cumulative_reward_2": 0.0,
81
  "rewards/final_correctness_reward_func": 0.125,
82
+ "rewards/question_recreation_reward_func": 0.14277737913653255,
83
  "rewards/soft_format_reward_func": 0.0,
84
+ "rewards/strict_format_reward_func": 0.0,
85
+ "rewards/xmlcount_reward_func": 0.11259375081135659,
86
  "step": 8
87
  },
88
  {
89
+ "completion_length": 388.5,
90
+ "epoch": 0.1335559265442404,
91
+ "grad_norm": 5.881807804107666,
92
+ "kl": 0.008150014091370394,
93
+ "learning_rate": 2.706448363680831e-07,
94
  "loss": 0.0,
95
+ "reward": 0.009647406113799661,
96
+ "reward_std": 0.48192066283809254,
97
+ "rewards/concensus_correctness_reward_func": 0.0,
98
+ "rewards/consensus_reward_func": 0.0,
99
  "rewards/cumulative_reward_2": 0.0,
100
+ "rewards/final_correctness_reward_func": 0.0,
101
+ "rewards/question_recreation_reward_func": 0.11724113661330193,
102
  "rewards/soft_format_reward_func": 0.0,
103
+ "rewards/strict_format_reward_func": 0.0,
104
+ "rewards/xmlcount_reward_func": -0.10759374493500218,
105
  "step": 10
106
  },
107
  {
108
+ "completion_length": 329.3125,
109
+ "epoch": 0.16026711185308848,
110
+ "grad_norm": 32.46157455444336,
111
+ "kl": 0.0016044293206505245,
112
+ "learning_rate": 1.886286282148002e-07,
113
+ "loss": 0.0,
114
+ "reward": 0.3704580693738535,
115
+ "reward_std": 0.46280903373553883,
116
+ "rewards/concensus_correctness_reward_func": 0.0,
117
+ "rewards/consensus_reward_func": 0.0,
118
  "rewards/cumulative_reward_2": 0.0,
119
+ "rewards/final_correctness_reward_func": 0.0625,
120
+ "rewards/question_recreation_reward_func": 0.19455181124794763,
121
  "rewards/soft_format_reward_func": 0.0,
122
+ "rewards/strict_format_reward_func": 0.015625,
123
+ "rewards/xmlcount_reward_func": 0.09778124839067459,
124
  "step": 12
125
  },
126
  {
127
+ "completion_length": 385.28125,
128
+ "epoch": 0.18697829716193656,
129
+ "grad_norm": 63.96397018432617,
130
+ "kl": 0.019191041095837136,
131
+ "learning_rate": 1.1326296046939333e-07,
132
  "loss": 0.0,
133
+ "reward": 0.7466699600918218,
134
+ "reward_std": 1.2343180573134873,
135
+ "rewards/concensus_correctness_reward_func": 0.625,
136
+ "rewards/consensus_reward_func": 0.0,
137
  "rewards/cumulative_reward_2": 0.0,
138
+ "rewards/final_correctness_reward_func": 0.0625,
139
+ "rewards/question_recreation_reward_func": 0.1140449601225555,
140
  "rewards/soft_format_reward_func": 0.0,
141
+ "rewards/strict_format_reward_func": 0.0,
142
+ "rewards/xmlcount_reward_func": -0.05487499665468931,
143
  "step": 14
144
  },
145
  {
146
+ "completion_length": 381.03125,
147
+ "epoch": 0.21368948247078465,
148
+ "grad_norm": 12.218117713928223,
149
+ "kl": 0.003359480255312519,
150
+ "learning_rate": 5.271487265090163e-08,
151
  "loss": 0.0,
152
+ "reward": 0.4025605304632336,
153
+ "reward_std": 0.7095250401762314,
154
+ "rewards/concensus_correctness_reward_func": 0.0,
155
+ "rewards/consensus_reward_func": 0.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  "rewards/cumulative_reward_2": 0.0,
157
  "rewards/final_correctness_reward_func": 0.125,
158
+ "rewards/question_recreation_reward_func": 0.23962301563005894,
159
  "rewards/soft_format_reward_func": 0.0,
160
+ "rewards/strict_format_reward_func": 0.0,
161
+ "rewards/xmlcount_reward_func": 0.037937495624646544,
162
+ "step": 16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  },
164
  {
165
+ "completion_length": 261.03125,
166
+ "epoch": 0.24040066777963273,
167
+ "grad_norm": 9.750836372375488,
168
+ "kl": 0.0017970694570976775,
169
+ "learning_rate": 1.3545689574841341e-08,
170
  "loss": 0.0,
171
+ "reward": 0.4833010680740699,
172
+ "reward_std": 0.5299922852864256,
173
+ "rewards/concensus_correctness_reward_func": 0.0,
174
+ "rewards/consensus_reward_func": 0.0625,
175
  "rewards/cumulative_reward_2": 0.0,
176
  "rewards/final_correctness_reward_func": 0.0625,
177
+ "rewards/question_recreation_reward_func": 0.2038323215674609,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  "rewards/soft_format_reward_func": 0.0,
179
+ "rewards/strict_format_reward_func": 0.0,
180
+ "rewards/xmlcount_reward_func": 0.15446875197812915,
181
+ "step": 18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  },
183
  {
184
+ "completion_length": 358.25,
185
+ "epoch": 0.2671118530884808,
186
+ "grad_norm": 30.307376861572266,
187
+ "kl": 0.0023762249620631337,
188
  "learning_rate": 0.0,
189
  "loss": 0.0,
190
+ "reward": 0.5694868991849944,
191
+ "reward_std": 0.5472512795531657,
192
+ "rewards/concensus_correctness_reward_func": 0.05999999865889549,
193
+ "rewards/consensus_reward_func": 0.0625,
194
  "rewards/cumulative_reward_2": 0.0,
195
+ "rewards/final_correctness_reward_func": 0.0625,
196
+ "rewards/question_recreation_reward_func": 0.25226815952919424,
197
  "rewards/soft_format_reward_func": 0.0,
198
+ "rewards/strict_format_reward_func": 0.0,
199
+ "rewards/xmlcount_reward_func": 0.13221875578165054,
200
+ "step": 20
201
  },
202
  {
203
+ "epoch": 0.2671118530884808,
204
+ "step": 20,
205
  "total_flos": 0.0,
206
+ "train_loss": 4.617630838765763e-06,
207
+ "train_runtime": 463.4946,
208
+ "train_samples_per_second": 0.69,
209
+ "train_steps_per_second": 0.043
210
  }
211
  ],
212
  "logging_steps": 2,
213
+ "max_steps": 20,
214
  "num_input_tokens_seen": 0,
215
  "num_train_epochs": 1,
216
  "save_steps": 25,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:532bf7f1a94350a92f8068625b70869e3de110bca42146896a6b0ee266ca450a
3
  size 6008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76acb6803d24a05a0407ebe59d186d61c08a8dec84cd1a5c3e957bf419604560
3
  size 6008