shashwat-nandan commited on
Commit
11b4c0b
·
verified ·
1 Parent(s): ea10513

End of training

Browse files
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.09253680892288685,
4
- "train_runtime": 133428.0905,
5
- "train_samples": 6,
6
  "train_samples_per_second": 0.001,
7
  "train_steps_per_second": 0.0
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.13233287669718266,
4
+ "train_runtime": 164161.8368,
5
+ "train_samples": 17,
6
  "train_samples_per_second": 0.001,
7
  "train_steps_per_second": 0.0
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dfc539f081d55c30d556844ce743afa9621005237355f33123dd09554b9e76cb
3
  size 1976163472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b1555ac512f53c15e1bd9d4da2105f74129d6491fcaf701e885d7bceb5246d0
3
  size 1976163472
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.09253680892288685,
4
- "train_runtime": 133428.0905,
5
- "train_samples": 6,
6
  "train_samples_per_second": 0.001,
7
  "train_steps_per_second": 0.0
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.13233287669718266,
4
+ "train_runtime": 164161.8368,
5
+ "train_samples": 17,
6
  "train_samples_per_second": 0.001,
7
  "train_steps_per_second": 0.0
8
  }
trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 19.666666666666668,
6
  "eval_steps": 500,
7
  "global_step": 20,
8
  "is_hyper_param_search": false,
@@ -15,37 +15,37 @@
15
  "clip_ratio/low_mean": 0.0,
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
- "completions/clipped_ratio": 0.0,
19
- "completions/max_length": 585.5,
20
- "completions/max_terminated_length": 585.5,
21
- "completions/mean_length": 164.9166717529297,
22
- "completions/mean_terminated_length": 164.9166717529297,
23
- "completions/min_length": 14.0,
24
- "completions/min_terminated_length": 14.0,
25
- "epoch": 1.6666666666666665,
26
- "grad_norm": 16.132144927978516,
27
  "kl": 0.0,
28
  "learning_rate": 5e-07,
29
- "loss": -0.1438,
30
- "num_tokens": 6253.0,
31
- "reward": 0.09254130534827709,
32
- "reward_std": 0.03140645381063223,
33
- "rewards/concensus_correctness_reward_func/mean": 0.0,
34
- "rewards/concensus_correctness_reward_func/std": 0.0,
35
- "rewards/consensus_reward_func/mean": 0.0,
36
- "rewards/consensus_reward_func/std": 0.0,
37
  "rewards/cumulative_reward_2/mean": 0.0,
38
  "rewards/cumulative_reward_2/std": 0.0,
39
  "rewards/final_correctness_reward_func/mean": 0.0,
40
  "rewards/final_correctness_reward_func/std": 0.0,
41
- "rewards/question_recreation_reward_func/mean": 0.08829130791127682,
42
- "rewards/question_recreation_reward_func/std": 0.04877444030717015,
43
  "rewards/soft_format_reward_func/mean": 0.0,
44
  "rewards/soft_format_reward_func/std": 0.0,
45
  "rewards/strict_format_reward_func/mean": 0.0,
46
  "rewards/strict_format_reward_func/std": 0.0,
47
- "rewards/xmlcount_reward_func/mean": 0.004249999765306711,
48
- "rewards/xmlcount_reward_func/std": 0.01550806313753128,
49
  "step": 2
50
  },
51
  {
@@ -55,36 +55,36 @@
55
  "clip_ratio/low_min": 0.0,
56
  "clip_ratio/region_mean": 0.0,
57
  "completions/clipped_ratio": 0.0,
58
- "completions/max_length": 247.0,
59
- "completions/max_terminated_length": 247.0,
60
- "completions/mean_length": 109.91666412353516,
61
- "completions/mean_terminated_length": 109.91666412353516,
62
- "completions/min_length": 7.5,
63
- "completions/min_terminated_length": 7.5,
64
- "epoch": 3.6666666666666665,
65
- "grad_norm": 7.240252494812012,
66
- "kl": 0.0010323685710318387,
67
  "learning_rate": 4.864543104251586e-07,
68
- "loss": 0.0967,
69
- "num_tokens": 12726.0,
70
- "reward": 0.1256694793701172,
71
- "reward_std": 0.3006303310394287,
72
  "rewards/concensus_correctness_reward_func/mean": 0.0,
73
  "rewards/concensus_correctness_reward_func/std": 0.0,
74
- "rewards/consensus_reward_func/mean": 0.125,
75
- "rewards/consensus_reward_func/std": 0.3535533845424652,
76
  "rewards/cumulative_reward_2/mean": 0.0,
77
  "rewards/cumulative_reward_2/std": 0.0,
78
  "rewards/final_correctness_reward_func/mean": 0.0,
79
  "rewards/final_correctness_reward_func/std": 0.0,
80
- "rewards/question_recreation_reward_func/mean": 0.020169468596577644,
81
- "rewards/question_recreation_reward_func/std": 0.015208072261884809,
82
  "rewards/soft_format_reward_func/mean": 0.0,
83
  "rewards/soft_format_reward_func/std": 0.0,
84
  "rewards/strict_format_reward_func/mean": 0.0,
85
  "rewards/strict_format_reward_func/std": 0.0,
86
- "rewards/xmlcount_reward_func/mean": -0.019500000402331352,
87
- "rewards/xmlcount_reward_func/std": 0.2519115321338177,
88
  "step": 4
89
  },
90
  {
@@ -94,20 +94,20 @@
94
  "clip_ratio/low_min": 0.0,
95
  "clip_ratio/region_mean": 0.0,
96
  "completions/clipped_ratio": 0.0,
97
- "completions/max_length": 318.5,
98
- "completions/max_terminated_length": 318.5,
99
- "completions/mean_length": 110.41666793823242,
100
- "completions/mean_terminated_length": 110.41666793823242,
101
- "completions/min_length": 5.0,
102
- "completions/min_terminated_length": 5.0,
103
- "epoch": 5.666666666666667,
104
- "grad_norm": 12.744539260864258,
105
- "kl": 0.001115468876378145,
106
  "learning_rate": 4.472851273490984e-07,
107
- "loss": 0.2975,
108
- "num_tokens": 18890.0,
109
- "reward": 0.03514278866350651,
110
- "reward_std": 0.029419854748994112,
111
  "rewards/concensus_correctness_reward_func/mean": 0.0,
112
  "rewards/concensus_correctness_reward_func/std": 0.0,
113
  "rewards/consensus_reward_func/mean": 0.0,
@@ -116,14 +116,14 @@
116
  "rewards/cumulative_reward_2/std": 0.0,
117
  "rewards/final_correctness_reward_func/mean": 0.0,
118
  "rewards/final_correctness_reward_func/std": 0.0,
119
- "rewards/question_recreation_reward_func/mean": 0.01989278756082058,
120
- "rewards/question_recreation_reward_func/std": 0.01218721829354763,
121
  "rewards/soft_format_reward_func/mean": 0.0,
122
  "rewards/soft_format_reward_func/std": 0.0,
123
  "rewards/strict_format_reward_func/mean": 0.0,
124
  "rewards/strict_format_reward_func/std": 0.0,
125
- "rewards/xmlcount_reward_func/mean": 0.015250000171363354,
126
- "rewards/xmlcount_reward_func/std": 0.04313351586461067,
127
  "step": 6
128
  },
129
  {
@@ -133,36 +133,36 @@
133
  "clip_ratio/low_min": 0.0,
134
  "clip_ratio/region_mean": 0.0,
135
  "completions/clipped_ratio": 0.0,
136
- "completions/max_length": 330.5,
137
- "completions/max_terminated_length": 330.5,
138
- "completions/mean_length": 93.75000190734863,
139
- "completions/mean_terminated_length": 93.75000190734863,
140
- "completions/min_length": 6.0,
141
- "completions/min_terminated_length": 6.0,
142
- "epoch": 7.666666666666667,
143
- "grad_norm": 24.797382354736328,
144
- "kl": 0.0010699216581997462,
145
  "learning_rate": 3.867370395306068e-07,
146
- "loss": 0.3365,
147
- "num_tokens": 24139.0,
148
- "reward": 0.2419363632798195,
149
- "reward_std": 0.2218155935406685,
150
  "rewards/concensus_correctness_reward_func/mean": 0.0,
151
  "rewards/concensus_correctness_reward_func/std": 0.0,
152
- "rewards/consensus_reward_func/mean": 0.125,
153
- "rewards/consensus_reward_func/std": 0.3535533845424652,
154
  "rewards/cumulative_reward_2/mean": 0.0,
155
  "rewards/cumulative_reward_2/std": 0.0,
156
  "rewards/final_correctness_reward_func/mean": 0.0,
157
  "rewards/final_correctness_reward_func/std": 0.0,
158
- "rewards/question_recreation_reward_func/mean": 0.07118633016943932,
159
- "rewards/question_recreation_reward_func/std": 0.04484980972483754,
160
  "rewards/soft_format_reward_func/mean": 0.0,
161
  "rewards/soft_format_reward_func/std": 0.0,
162
  "rewards/strict_format_reward_func/mean": 0.0,
163
  "rewards/strict_format_reward_func/std": 0.0,
164
- "rewards/xmlcount_reward_func/mean": 0.04575000051409006,
165
- "rewards/xmlcount_reward_func/std": 0.09960854053497314,
166
  "step": 8
167
  },
168
  {
@@ -172,20 +172,20 @@
172
  "clip_ratio/low_min": 0.0,
173
  "clip_ratio/region_mean": 0.0,
174
  "completions/clipped_ratio": 0.0,
175
- "completions/max_length": 243.0,
176
- "completions/max_terminated_length": 243.0,
177
- "completions/mean_length": 128.16666412353516,
178
- "completions/mean_terminated_length": 128.16666412353516,
179
- "completions/min_length": 32.0,
180
- "completions/min_terminated_length": 32.0,
181
- "epoch": 9.666666666666666,
182
- "grad_norm": 26.717540740966797,
183
- "kl": 0.001343091771559557,
184
  "learning_rate": 3.1137137178519977e-07,
185
- "loss": -0.0579,
186
- "num_tokens": 30080.0,
187
- "reward": 0.07627854682505131,
188
- "reward_std": 0.056225333362817764,
189
  "rewards/concensus_correctness_reward_func/mean": 0.0,
190
  "rewards/concensus_correctness_reward_func/std": 0.0,
191
  "rewards/consensus_reward_func/mean": 0.0,
@@ -194,14 +194,14 @@
194
  "rewards/cumulative_reward_2/std": 0.0,
195
  "rewards/final_correctness_reward_func/mean": 0.0,
196
  "rewards/final_correctness_reward_func/std": 0.0,
197
- "rewards/question_recreation_reward_func/mean": 0.06927854334935546,
198
- "rewards/question_recreation_reward_func/std": 0.03935649152845144,
199
  "rewards/soft_format_reward_func/mean": 0.0,
200
  "rewards/soft_format_reward_func/std": 0.0,
201
  "rewards/strict_format_reward_func/mean": 0.0,
202
  "rewards/strict_format_reward_func/std": 0.0,
203
- "rewards/xmlcount_reward_func/mean": 0.007000000216066837,
204
- "rewards/xmlcount_reward_func/std": 0.05189000070095062,
205
  "step": 10
206
  },
207
  {
@@ -211,36 +211,36 @@
211
  "clip_ratio/low_min": 0.0,
212
  "clip_ratio/region_mean": 0.0,
213
  "completions/clipped_ratio": 0.0,
214
- "completions/max_length": 150.0,
215
- "completions/max_terminated_length": 150.0,
216
- "completions/mean_length": 54.58333396911621,
217
- "completions/mean_terminated_length": 54.58333396911621,
218
- "completions/min_length": 9.0,
219
- "completions/min_terminated_length": 9.0,
220
- "epoch": 11.666666666666666,
221
- "grad_norm": 18.737913131713867,
222
- "kl": 0.00392270221709623,
223
  "learning_rate": 2.2935516363191693e-07,
224
- "loss": 0.1934,
225
- "num_tokens": 34949.0,
226
- "reward": 0.29006822407245636,
227
- "reward_std": 0.22484304383397102,
228
  "rewards/concensus_correctness_reward_func/mean": 0.0,
229
  "rewards/concensus_correctness_reward_func/std": 0.0,
230
- "rewards/consensus_reward_func/mean": 0.0,
231
- "rewards/consensus_reward_func/std": 0.0,
232
  "rewards/cumulative_reward_2/mean": 0.0,
233
  "rewards/cumulative_reward_2/std": 0.0,
234
- "rewards/final_correctness_reward_func/mean": 0.125,
235
- "rewards/final_correctness_reward_func/std": 0.3535533845424652,
236
- "rewards/question_recreation_reward_func/mean": 0.09619322419166565,
237
- "rewards/question_recreation_reward_func/std": 0.05533384159207344,
238
  "rewards/soft_format_reward_func/mean": 0.0,
239
  "rewards/soft_format_reward_func/std": 0.0,
240
  "rewards/strict_format_reward_func/mean": 0.0,
241
  "rewards/strict_format_reward_func/std": 0.0,
242
- "rewards/xmlcount_reward_func/mean": 0.06887500174343586,
243
- "rewards/xmlcount_reward_func/std": 0.11247996985912323,
244
  "step": 12
245
  },
246
  {
@@ -250,20 +250,20 @@
250
  "clip_ratio/low_min": 0.0,
251
  "clip_ratio/region_mean": 0.0,
252
  "completions/clipped_ratio": 0.0,
253
- "completions/max_length": 333.5,
254
- "completions/max_terminated_length": 333.5,
255
- "completions/mean_length": 98.33333206176758,
256
- "completions/mean_terminated_length": 98.33333206176758,
257
- "completions/min_length": 4.5,
258
- "completions/min_terminated_length": 4.5,
259
- "epoch": 13.666666666666666,
260
- "grad_norm": 15.76174259185791,
261
- "kl": 0.0010197372721449938,
262
  "learning_rate": 1.4957614383675767e-07,
263
- "loss": 0.1083,
264
- "num_tokens": 40602.0,
265
- "reward": 0.11224918067455292,
266
- "reward_std": 0.035229440312832594,
267
  "rewards/concensus_correctness_reward_func/mean": 0.0,
268
  "rewards/concensus_correctness_reward_func/std": 0.0,
269
  "rewards/consensus_reward_func/mean": 0.0,
@@ -272,14 +272,14 @@
272
  "rewards/cumulative_reward_2/std": 0.0,
273
  "rewards/final_correctness_reward_func/mean": 0.0,
274
  "rewards/final_correctness_reward_func/std": 0.0,
275
- "rewards/question_recreation_reward_func/mean": 0.09249918535351753,
276
- "rewards/question_recreation_reward_func/std": 0.02914312807843089,
277
  "rewards/soft_format_reward_func/mean": 0.0,
278
  "rewards/soft_format_reward_func/std": 0.0,
279
  "rewards/strict_format_reward_func/mean": 0.0,
280
  "rewards/strict_format_reward_func/std": 0.0,
281
- "rewards/xmlcount_reward_func/mean": 0.019749999977648258,
282
- "rewards/xmlcount_reward_func/std": 0.08688061870634556,
283
  "step": 14
284
  },
285
  {
@@ -289,36 +289,36 @@
289
  "clip_ratio/low_min": 0.0,
290
  "clip_ratio/region_mean": 0.0,
291
  "completions/clipped_ratio": 0.0,
292
- "completions/max_length": 240.0,
293
- "completions/max_terminated_length": 240.0,
294
- "completions/mean_length": 68.41666507720947,
295
- "completions/mean_terminated_length": 68.41666507720947,
296
- "completions/min_length": 4.5,
297
- "completions/min_terminated_length": 4.5,
298
- "epoch": 15.666666666666666,
299
- "grad_norm": 37.723182678222656,
300
- "kl": 0.0013724988639296498,
301
  "learning_rate": 8.067960709356478e-08,
302
- "loss": -0.1065,
303
- "num_tokens": 45868.0,
304
- "reward": 0.1889171525835991,
305
- "reward_std": 0.28485801815986633,
306
  "rewards/concensus_correctness_reward_func/mean": 0.0,
307
  "rewards/concensus_correctness_reward_func/std": 0.0,
308
- "rewards/consensus_reward_func/mean": 0.0,
309
- "rewards/consensus_reward_func/std": 0.0,
310
  "rewards/cumulative_reward_2/mean": 0.0,
311
  "rewards/cumulative_reward_2/std": 0.0,
312
- "rewards/final_correctness_reward_func/mean": 0.125,
313
- "rewards/final_correctness_reward_func/std": 0.3535533845424652,
314
- "rewards/question_recreation_reward_func/mean": 0.01604215893894434,
315
- "rewards/question_recreation_reward_func/std": 0.010827061953023076,
316
  "rewards/soft_format_reward_func/mean": 0.0,
317
  "rewards/soft_format_reward_func/std": 0.0,
318
  "rewards/strict_format_reward_func/mean": 0.0,
319
  "rewards/strict_format_reward_func/std": 0.0,
320
- "rewards/xmlcount_reward_func/mean": 0.04787500109523535,
321
- "rewards/xmlcount_reward_func/std": 0.13023310527205467,
322
  "step": 16
323
  },
324
  {
@@ -328,36 +328,36 @@
328
  "clip_ratio/low_min": 0.0,
329
  "clip_ratio/region_mean": 0.0,
330
  "completions/clipped_ratio": 0.0,
331
- "completions/max_length": 294.5,
332
- "completions/max_terminated_length": 294.5,
333
- "completions/mean_length": 94.5,
334
- "completions/mean_terminated_length": 94.5,
335
- "completions/min_length": 4.0,
336
- "completions/min_terminated_length": 4.0,
337
- "epoch": 17.666666666666668,
338
- "grad_norm": 10.869548797607422,
339
- "kl": 0.0006961950239201542,
340
  "learning_rate": 3.013156219837776e-08,
341
- "loss": 0.2133,
342
- "num_tokens": 52010.0,
343
- "reward": 0.15816733241081238,
344
- "reward_std": 0.04277936555445194,
345
  "rewards/concensus_correctness_reward_func/mean": 0.0,
346
  "rewards/concensus_correctness_reward_func/std": 0.0,
347
- "rewards/consensus_reward_func/mean": 0.0,
348
- "rewards/consensus_reward_func/std": 0.0,
349
  "rewards/cumulative_reward_2/mean": 0.0,
350
  "rewards/cumulative_reward_2/std": 0.0,
351
  "rewards/final_correctness_reward_func/mean": 0.0,
352
  "rewards/final_correctness_reward_func/std": 0.0,
353
- "rewards/question_recreation_reward_func/mean": 0.12766733393073082,
354
- "rewards/question_recreation_reward_func/std": 0.053603312000632286,
355
  "rewards/soft_format_reward_func/mean": 0.0,
356
  "rewards/soft_format_reward_func/std": 0.0,
357
  "rewards/strict_format_reward_func/mean": 0.0,
358
  "rewards/strict_format_reward_func/std": 0.0,
359
- "rewards/xmlcount_reward_func/mean": 0.030500000342726707,
360
- "rewards/xmlcount_reward_func/std": 0.05647502467036247,
361
  "step": 18
362
  },
363
  {
@@ -367,52 +367,52 @@
367
  "clip_ratio/low_min": 0.0,
368
  "clip_ratio/region_mean": 0.0,
369
  "completions/clipped_ratio": 0.0,
370
- "completions/max_length": 366.0,
371
- "completions/max_terminated_length": 366.0,
372
- "completions/mean_length": 85.33333587646484,
373
- "completions/mean_terminated_length": 85.33333587646484,
374
- "completions/min_length": 4.5,
375
- "completions/min_terminated_length": 4.5,
376
- "epoch": 19.666666666666668,
377
- "grad_norm": 27.261966705322266,
378
- "kl": 0.0019625511777121574,
379
  "learning_rate": 3.4096741493194193e-09,
380
- "loss": -0.012,
381
- "num_tokens": 57319.0,
382
- "reward": 0.21800990775227547,
383
- "reward_std": 0.24578243133146316,
384
- "rewards/concensus_correctness_reward_func/mean": 0.04774999991059303,
385
- "rewards/concensus_correctness_reward_func/std": 0.13505738973617554,
386
- "rewards/consensus_reward_func/mean": 0.125,
387
- "rewards/consensus_reward_func/std": 0.3535533845424652,
388
  "rewards/cumulative_reward_2/mean": 0.0,
389
  "rewards/cumulative_reward_2/std": 0.0,
390
- "rewards/final_correctness_reward_func/mean": 0.0,
391
- "rewards/final_correctness_reward_func/std": 0.0,
392
- "rewards/question_recreation_reward_func/mean": 0.014759915880858898,
393
- "rewards/question_recreation_reward_func/std": 0.005782891297712922,
394
  "rewards/soft_format_reward_func/mean": 0.0,
395
  "rewards/soft_format_reward_func/std": 0.0,
396
  "rewards/strict_format_reward_func/mean": 0.0,
397
  "rewards/strict_format_reward_func/std": 0.0,
398
- "rewards/xmlcount_reward_func/mean": 0.030500000342726707,
399
- "rewards/xmlcount_reward_func/std": 0.05647502467036247,
400
  "step": 20
401
  },
402
  {
403
- "epoch": 19.666666666666668,
404
  "step": 20,
405
  "total_flos": 0.0,
406
- "train_loss": 0.09253680892288685,
407
- "train_runtime": 133428.0905,
408
  "train_samples_per_second": 0.001,
409
  "train_steps_per_second": 0.0
410
  }
411
  ],
412
  "logging_steps": 2,
413
  "max_steps": 20,
414
- "num_input_tokens_seen": 57319,
415
- "num_train_epochs": 20,
416
  "save_steps": 25,
417
  "stateful_callbacks": {
418
  "TrainerControl": {
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 4.9411764705882355,
6
  "eval_steps": 500,
7
  "global_step": 20,
8
  "is_hyper_param_search": false,
 
15
  "clip_ratio/low_mean": 0.0,
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
+ "completions/clipped_ratio": 0.0625,
19
+ "completions/max_length": 628.0,
20
+ "completions/max_terminated_length": 293.0,
21
+ "completions/mean_length": 176.4375,
22
+ "completions/mean_terminated_length": 123.19642639160156,
23
+ "completions/min_length": 8.0,
24
+ "completions/min_terminated_length": 8.0,
25
+ "epoch": 0.47058823529411764,
26
+ "grad_norm": 51.56406784057617,
27
  "kl": 0.0,
28
  "learning_rate": 5e-07,
29
+ "loss": 0.3766,
30
+ "num_tokens": 6919.0,
31
+ "reward": 0.3104562256485224,
32
+ "reward_std": 0.32131396047770977,
33
+ "rewards/concensus_correctness_reward_func/mean": 0.03125,
34
+ "rewards/concensus_correctness_reward_func/std": 0.0883883461356163,
35
+ "rewards/consensus_reward_func/mean": 0.25,
36
+ "rewards/consensus_reward_func/std": 0.46291008591651917,
37
  "rewards/cumulative_reward_2/mean": 0.0,
38
  "rewards/cumulative_reward_2/std": 0.0,
39
  "rewards/final_correctness_reward_func/mean": 0.0,
40
  "rewards/final_correctness_reward_func/std": 0.0,
41
+ "rewards/question_recreation_reward_func/mean": 0.02920621819794178,
42
+ "rewards/question_recreation_reward_func/std": 0.023029986768960953,
43
  "rewards/soft_format_reward_func/mean": 0.0,
44
  "rewards/soft_format_reward_func/std": 0.0,
45
  "rewards/strict_format_reward_func/mean": 0.0,
46
  "rewards/strict_format_reward_func/std": 0.0,
47
+ "rewards/xmlcount_reward_func/mean": 0.0,
48
+ "rewards/xmlcount_reward_func/std": 0.0,
49
  "step": 2
50
  },
51
  {
 
55
  "clip_ratio/low_min": 0.0,
56
  "clip_ratio/region_mean": 0.0,
57
  "completions/clipped_ratio": 0.0,
58
+ "completions/max_length": 419.5,
59
+ "completions/max_terminated_length": 419.5,
60
+ "completions/mean_length": 305.375,
61
+ "completions/mean_terminated_length": 305.375,
62
+ "completions/min_length": 193.5,
63
+ "completions/min_terminated_length": 193.5,
64
+ "epoch": 0.9411764705882353,
65
+ "grad_norm": 11.458623886108398,
66
+ "kl": 0.0007662410207558423,
67
  "learning_rate": 4.864543104251586e-07,
68
+ "loss": 0.0561,
69
+ "num_tokens": 14324.0,
70
+ "reward": 0.023023009300231934,
71
+ "reward_std": 0.01096764812245965,
72
  "rewards/concensus_correctness_reward_func/mean": 0.0,
73
  "rewards/concensus_correctness_reward_func/std": 0.0,
74
+ "rewards/consensus_reward_func/mean": 0.0,
75
+ "rewards/consensus_reward_func/std": 0.0,
76
  "rewards/cumulative_reward_2/mean": 0.0,
77
  "rewards/cumulative_reward_2/std": 0.0,
78
  "rewards/final_correctness_reward_func/mean": 0.0,
79
  "rewards/final_correctness_reward_func/std": 0.0,
80
+ "rewards/question_recreation_reward_func/mean": 0.023023009300231934,
81
+ "rewards/question_recreation_reward_func/std": 0.015222020447254181,
82
  "rewards/soft_format_reward_func/mean": 0.0,
83
  "rewards/soft_format_reward_func/std": 0.0,
84
  "rewards/strict_format_reward_func/mean": 0.0,
85
  "rewards/strict_format_reward_func/std": 0.0,
86
+ "rewards/xmlcount_reward_func/mean": 0.0,
87
+ "rewards/xmlcount_reward_func/std": 0.0,
88
  "step": 4
89
  },
90
  {
 
94
  "clip_ratio/low_min": 0.0,
95
  "clip_ratio/region_mean": 0.0,
96
  "completions/clipped_ratio": 0.0,
97
+ "completions/max_length": 367.0,
98
+ "completions/max_terminated_length": 367.0,
99
+ "completions/mean_length": 114.125,
100
+ "completions/mean_terminated_length": 114.125,
101
+ "completions/min_length": 8.0,
102
+ "completions/min_terminated_length": 8.0,
103
+ "epoch": 1.4705882352941178,
104
+ "grad_norm": 44.851749420166016,
105
+ "kl": 0.0016121815278893337,
106
  "learning_rate": 4.472851273490984e-07,
107
+ "loss": -0.0794,
108
+ "num_tokens": 20246.0,
109
+ "reward": 0.17371663451194763,
110
+ "reward_std": 0.0570271871984005,
111
  "rewards/concensus_correctness_reward_func/mean": 0.0,
112
  "rewards/concensus_correctness_reward_func/std": 0.0,
113
  "rewards/consensus_reward_func/mean": 0.0,
 
116
  "rewards/cumulative_reward_2/std": 0.0,
117
  "rewards/final_correctness_reward_func/mean": 0.0,
118
  "rewards/final_correctness_reward_func/std": 0.0,
119
+ "rewards/question_recreation_reward_func/mean": 0.17371663451194763,
120
+ "rewards/question_recreation_reward_func/std": 0.06652013398706913,
121
  "rewards/soft_format_reward_func/mean": 0.0,
122
  "rewards/soft_format_reward_func/std": 0.0,
123
  "rewards/strict_format_reward_func/mean": 0.0,
124
  "rewards/strict_format_reward_func/std": 0.0,
125
+ "rewards/xmlcount_reward_func/mean": 0.0,
126
+ "rewards/xmlcount_reward_func/std": 0.0,
127
  "step": 6
128
  },
129
  {
 
133
  "clip_ratio/low_min": 0.0,
134
  "clip_ratio/region_mean": 0.0,
135
  "completions/clipped_ratio": 0.0,
136
+ "completions/max_length": 305.0,
137
+ "completions/max_terminated_length": 305.0,
138
+ "completions/mean_length": 152.5625,
139
+ "completions/mean_terminated_length": 152.5625,
140
+ "completions/min_length": 42.0,
141
+ "completions/min_terminated_length": 42.0,
142
+ "epoch": 1.9411764705882353,
143
+ "grad_norm": 9.940235137939453,
144
+ "kl": 0.000744940982258413,
145
  "learning_rate": 3.867370395306068e-07,
146
+ "loss": 0.0534,
147
+ "num_tokens": 27557.0,
148
+ "reward": 0.02366686426103115,
149
+ "reward_std": 0.008607423398643732,
150
  "rewards/concensus_correctness_reward_func/mean": 0.0,
151
  "rewards/concensus_correctness_reward_func/std": 0.0,
152
+ "rewards/consensus_reward_func/mean": 0.0,
153
+ "rewards/consensus_reward_func/std": 0.0,
154
  "rewards/cumulative_reward_2/mean": 0.0,
155
  "rewards/cumulative_reward_2/std": 0.0,
156
  "rewards/final_correctness_reward_func/mean": 0.0,
157
  "rewards/final_correctness_reward_func/std": 0.0,
158
+ "rewards/question_recreation_reward_func/mean": 0.023666863329708576,
159
+ "rewards/question_recreation_reward_func/std": 0.017653593327850103,
160
  "rewards/soft_format_reward_func/mean": 0.0,
161
  "rewards/soft_format_reward_func/std": 0.0,
162
  "rewards/strict_format_reward_func/mean": 0.0,
163
  "rewards/strict_format_reward_func/std": 0.0,
164
+ "rewards/xmlcount_reward_func/mean": 0.0,
165
+ "rewards/xmlcount_reward_func/std": 0.0,
166
  "step": 8
167
  },
168
  {
 
172
  "clip_ratio/low_min": 0.0,
173
  "clip_ratio/region_mean": 0.0,
174
  "completions/clipped_ratio": 0.0,
175
+ "completions/max_length": 509.0,
176
+ "completions/max_terminated_length": 509.0,
177
+ "completions/mean_length": 250.5,
178
+ "completions/mean_terminated_length": 250.5,
179
+ "completions/min_length": 64.5,
180
+ "completions/min_terminated_length": 64.5,
181
+ "epoch": 2.4705882352941178,
182
+ "grad_norm": 5.178226470947266,
183
+ "kl": 0.0007074162567732856,
184
  "learning_rate": 3.1137137178519977e-07,
185
+ "loss": 0.199,
186
+ "num_tokens": 35661.0,
187
+ "reward": 0.08728898875415325,
188
+ "reward_std": 0.025852903723716736,
189
  "rewards/concensus_correctness_reward_func/mean": 0.0,
190
  "rewards/concensus_correctness_reward_func/std": 0.0,
191
  "rewards/consensus_reward_func/mean": 0.0,
 
194
  "rewards/cumulative_reward_2/std": 0.0,
195
  "rewards/final_correctness_reward_func/mean": 0.0,
196
  "rewards/final_correctness_reward_func/std": 0.0,
197
+ "rewards/question_recreation_reward_func/mean": 0.08728898782283068,
198
+ "rewards/question_recreation_reward_func/std": 0.03699528565630317,
199
  "rewards/soft_format_reward_func/mean": 0.0,
200
  "rewards/soft_format_reward_func/std": 0.0,
201
  "rewards/strict_format_reward_func/mean": 0.0,
202
  "rewards/strict_format_reward_func/std": 0.0,
203
+ "rewards/xmlcount_reward_func/mean": 0.0,
204
+ "rewards/xmlcount_reward_func/std": 0.0,
205
  "step": 10
206
  },
207
  {
 
211
  "clip_ratio/low_min": 0.0,
212
  "clip_ratio/region_mean": 0.0,
213
  "completions/clipped_ratio": 0.0,
214
+ "completions/max_length": 304.5,
215
+ "completions/max_terminated_length": 304.5,
216
+ "completions/mean_length": 166.875,
217
+ "completions/mean_terminated_length": 166.875,
218
+ "completions/min_length": 82.5,
219
+ "completions/min_terminated_length": 82.5,
220
+ "epoch": 2.9411764705882355,
221
+ "grad_norm": 9.689749717712402,
222
+ "kl": 0.0008264059497378184,
223
  "learning_rate": 2.2935516363191693e-07,
224
+ "loss": 0.2115,
225
+ "num_tokens": 42270.0,
226
+ "reward": 0.20029596518725157,
227
+ "reward_std": 0.19888783944770694,
228
  "rewards/concensus_correctness_reward_func/mean": 0.0,
229
  "rewards/concensus_correctness_reward_func/std": 0.0,
230
+ "rewards/consensus_reward_func/mean": 0.125,
231
+ "rewards/consensus_reward_func/std": 0.3535533845424652,
232
  "rewards/cumulative_reward_2/mean": 0.0,
233
  "rewards/cumulative_reward_2/std": 0.0,
234
+ "rewards/final_correctness_reward_func/mean": 0.0,
235
+ "rewards/final_correctness_reward_func/std": 0.0,
236
+ "rewards/question_recreation_reward_func/mean": 0.07529596518725157,
237
+ "rewards/question_recreation_reward_func/std": 0.03689309814944863,
238
  "rewards/soft_format_reward_func/mean": 0.0,
239
  "rewards/soft_format_reward_func/std": 0.0,
240
  "rewards/strict_format_reward_func/mean": 0.0,
241
  "rewards/strict_format_reward_func/std": 0.0,
242
+ "rewards/xmlcount_reward_func/mean": 0.0,
243
+ "rewards/xmlcount_reward_func/std": 0.0,
244
  "step": 12
245
  },
246
  {
 
250
  "clip_ratio/low_min": 0.0,
251
  "clip_ratio/region_mean": 0.0,
252
  "completions/clipped_ratio": 0.0,
253
+ "completions/max_length": 426.5,
254
+ "completions/max_terminated_length": 426.5,
255
+ "completions/mean_length": 159.5,
256
+ "completions/mean_terminated_length": 159.5,
257
+ "completions/min_length": 2.0,
258
+ "completions/min_terminated_length": 2.0,
259
+ "epoch": 3.4705882352941178,
260
+ "grad_norm": 8.103471755981445,
261
+ "kl": 0.0017809901619330049,
262
  "learning_rate": 1.4957614383675767e-07,
263
+ "loss": 0.0366,
264
+ "num_tokens": 48918.0,
265
+ "reward": 0.01776011288166046,
266
+ "reward_std": 0.006043280474841595,
267
  "rewards/concensus_correctness_reward_func/mean": 0.0,
268
  "rewards/concensus_correctness_reward_func/std": 0.0,
269
  "rewards/consensus_reward_func/mean": 0.0,
 
272
  "rewards/cumulative_reward_2/std": 0.0,
273
  "rewards/final_correctness_reward_func/mean": 0.0,
274
  "rewards/final_correctness_reward_func/std": 0.0,
275
+ "rewards/question_recreation_reward_func/mean": 0.01776011334732175,
276
+ "rewards/question_recreation_reward_func/std": 0.017151668202131987,
277
  "rewards/soft_format_reward_func/mean": 0.0,
278
  "rewards/soft_format_reward_func/std": 0.0,
279
  "rewards/strict_format_reward_func/mean": 0.0,
280
  "rewards/strict_format_reward_func/std": 0.0,
281
+ "rewards/xmlcount_reward_func/mean": 0.0,
282
+ "rewards/xmlcount_reward_func/std": 0.0,
283
  "step": 14
284
  },
285
  {
 
289
  "clip_ratio/low_min": 0.0,
290
  "clip_ratio/region_mean": 0.0,
291
  "completions/clipped_ratio": 0.0,
292
+ "completions/max_length": 217.5,
293
+ "completions/max_terminated_length": 217.5,
294
+ "completions/mean_length": 112.6875,
295
+ "completions/mean_terminated_length": 112.6875,
296
+ "completions/min_length": 52.5,
297
+ "completions/min_terminated_length": 52.5,
298
+ "epoch": 3.9411764705882355,
299
+ "grad_norm": 8.5344877243042,
300
+ "kl": 0.0009168514661723748,
301
  "learning_rate": 8.067960709356478e-08,
302
+ "loss": 0.2618,
303
+ "num_tokens": 55559.0,
304
+ "reward": 0.14910611137747765,
305
+ "reward_std": 0.18657799949869514,
306
  "rewards/concensus_correctness_reward_func/mean": 0.0,
307
  "rewards/concensus_correctness_reward_func/std": 0.0,
308
+ "rewards/consensus_reward_func/mean": 0.125,
309
+ "rewards/consensus_reward_func/std": 0.3535533845424652,
310
  "rewards/cumulative_reward_2/mean": 0.0,
311
  "rewards/cumulative_reward_2/std": 0.0,
312
+ "rewards/final_correctness_reward_func/mean": 0.0,
313
+ "rewards/final_correctness_reward_func/std": 0.0,
314
+ "rewards/question_recreation_reward_func/mean": 0.02410610392689705,
315
+ "rewards/question_recreation_reward_func/std": 0.02181409765034914,
316
  "rewards/soft_format_reward_func/mean": 0.0,
317
  "rewards/soft_format_reward_func/std": 0.0,
318
  "rewards/strict_format_reward_func/mean": 0.0,
319
  "rewards/strict_format_reward_func/std": 0.0,
320
+ "rewards/xmlcount_reward_func/mean": 0.0,
321
+ "rewards/xmlcount_reward_func/std": 0.0,
322
  "step": 16
323
  },
324
  {
 
328
  "clip_ratio/low_min": 0.0,
329
  "clip_ratio/region_mean": 0.0,
330
  "completions/clipped_ratio": 0.0,
331
+ "completions/max_length": 281.5,
332
+ "completions/max_terminated_length": 281.5,
333
+ "completions/mean_length": 118.0,
334
+ "completions/mean_terminated_length": 118.0,
335
+ "completions/min_length": 18.5,
336
+ "completions/min_terminated_length": 18.5,
337
+ "epoch": 4.470588235294118,
338
+ "grad_norm": 19.964595794677734,
339
+ "kl": 0.0011214540027140174,
340
  "learning_rate": 3.013156219837776e-08,
341
+ "loss": 0.2282,
342
+ "num_tokens": 61543.0,
343
+ "reward": 0.32675740122795105,
344
+ "reward_std": 0.044070890406146646,
345
  "rewards/concensus_correctness_reward_func/mean": 0.0,
346
  "rewards/concensus_correctness_reward_func/std": 0.0,
347
+ "rewards/consensus_reward_func/mean": 0.25,
348
+ "rewards/consensus_reward_func/std": 0.46291008591651917,
349
  "rewards/cumulative_reward_2/mean": 0.0,
350
  "rewards/cumulative_reward_2/std": 0.0,
351
  "rewards/final_correctness_reward_func/mean": 0.0,
352
  "rewards/final_correctness_reward_func/std": 0.0,
353
+ "rewards/question_recreation_reward_func/mean": 0.07675742218270898,
354
+ "rewards/question_recreation_reward_func/std": 0.03963730251416564,
355
  "rewards/soft_format_reward_func/mean": 0.0,
356
  "rewards/soft_format_reward_func/std": 0.0,
357
  "rewards/strict_format_reward_func/mean": 0.0,
358
  "rewards/strict_format_reward_func/std": 0.0,
359
+ "rewards/xmlcount_reward_func/mean": 0.0,
360
+ "rewards/xmlcount_reward_func/std": 0.0,
361
  "step": 18
362
  },
363
  {
 
367
  "clip_ratio/low_min": 0.0,
368
  "clip_ratio/region_mean": 0.0,
369
  "completions/clipped_ratio": 0.0,
370
+ "completions/max_length": 221.0,
371
+ "completions/max_terminated_length": 221.0,
372
+ "completions/mean_length": 119.3125,
373
+ "completions/mean_terminated_length": 119.3125,
374
+ "completions/min_length": 62.5,
375
+ "completions/min_terminated_length": 62.5,
376
+ "epoch": 4.9411764705882355,
377
+ "grad_norm": 14.930213928222656,
378
+ "kl": 0.0009697647255961783,
379
  "learning_rate": 3.4096741493194193e-09,
380
+ "loss": -0.0206,
381
+ "num_tokens": 67694.0,
382
+ "reward": 0.2407371997833252,
383
+ "reward_std": 0.20068720169365406,
384
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
385
+ "rewards/concensus_correctness_reward_func/std": 0.0,
386
+ "rewards/consensus_reward_func/mean": 0.0,
387
+ "rewards/consensus_reward_func/std": 0.0,
388
  "rewards/cumulative_reward_2/mean": 0.0,
389
  "rewards/cumulative_reward_2/std": 0.0,
390
+ "rewards/final_correctness_reward_func/mean": 0.125,
391
+ "rewards/final_correctness_reward_func/std": 0.3535533845424652,
392
+ "rewards/question_recreation_reward_func/mean": 0.11573721468448639,
393
+ "rewards/question_recreation_reward_func/std": 0.07402005046606064,
394
  "rewards/soft_format_reward_func/mean": 0.0,
395
  "rewards/soft_format_reward_func/std": 0.0,
396
  "rewards/strict_format_reward_func/mean": 0.0,
397
  "rewards/strict_format_reward_func/std": 0.0,
398
+ "rewards/xmlcount_reward_func/mean": 0.0,
399
+ "rewards/xmlcount_reward_func/std": 0.0,
400
  "step": 20
401
  },
402
  {
403
+ "epoch": 4.9411764705882355,
404
  "step": 20,
405
  "total_flos": 0.0,
406
+ "train_loss": 0.13233287669718266,
407
+ "train_runtime": 164161.8368,
408
  "train_samples_per_second": 0.001,
409
  "train_steps_per_second": 0.0
410
  }
411
  ],
412
  "logging_steps": 2,
413
  "max_steps": 20,
414
+ "num_input_tokens_seen": 67694,
415
+ "num_train_epochs": 5,
416
  "save_steps": 25,
417
  "stateful_callbacks": {
418
  "TrainerControl": {
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:620dcea1f14346a2fbd9e648d69f599772d688654d6de6ac407a35553b204b96
3
  size 6929
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e2e3bc5392965d5dafa468948526080a63b0daeacc65feb1086c19428f6ca9d
3
  size 6929