JW17's picture
Add files using upload-large-folder tool
7c4ef46 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2326934264107039,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 489.01953125,
"epoch": 0.0011634671320535194,
"grad_norm": 0.4066821416431487,
"kl": 0.0004706382751464844,
"learning_rate": 3.846153846153846e-08,
"loss": 0.0,
"reward": 0.107421875,
"reward_std": 0.14256841503083706,
"rewards/correctness_reward_func": 0.1015625,
"rewards/strict_format_reward_func": 0.005859375,
"step": 1
},
{
"completion_length": 491.90234375,
"epoch": 0.002326934264107039,
"grad_norm": 1.2204081989870978,
"kl": 0.0008008480072021484,
"learning_rate": 7.692307692307692e-08,
"loss": 0.0,
"reward": 0.12109375,
"reward_std": 0.22897969186306,
"rewards/correctness_reward_func": 0.1015625,
"rewards/strict_format_reward_func": 0.01953125,
"step": 2
},
{
"completion_length": 489.51953125,
"epoch": 0.0034904013961605585,
"grad_norm": 9.972307004201902,
"kl": 0.004039764404296875,
"learning_rate": 1.1538461538461539e-07,
"loss": 0.0002,
"reward": 0.064453125,
"reward_std": 0.125604297965765,
"rewards/correctness_reward_func": 0.0546875,
"rewards/strict_format_reward_func": 0.009765625,
"step": 3
},
{
"completion_length": 506.64453125,
"epoch": 0.004653868528214078,
"grad_norm": 1.442368000970251,
"kl": 0.009157180786132812,
"learning_rate": 1.5384615384615385e-07,
"loss": 0.0004,
"reward": 0.09765625,
"reward_std": 0.17736226692795753,
"rewards/correctness_reward_func": 0.0859375,
"rewards/strict_format_reward_func": 0.01171875,
"step": 4
},
{
"completion_length": 494.796875,
"epoch": 0.005817335660267597,
"grad_norm": 2.215173709920235,
"kl": 0.004576206207275391,
"learning_rate": 1.9230769230769231e-07,
"loss": 0.0002,
"reward": 0.09375,
"reward_std": 0.1610843911767006,
"rewards/correctness_reward_func": 0.0859375,
"rewards/strict_format_reward_func": 0.0078125,
"step": 5
},
{
"completion_length": 500.6875,
"epoch": 0.006980802792321117,
"grad_norm": 1.5436419526429799,
"kl": 0.0007085800170898438,
"learning_rate": 2.3076923076923078e-07,
"loss": 0.0,
"reward": 0.15234375,
"reward_std": 0.24702188372612,
"rewards/correctness_reward_func": 0.140625,
"rewards/strict_format_reward_func": 0.01171875,
"step": 6
},
{
"completion_length": 502.09375,
"epoch": 0.008144269924374637,
"grad_norm": 16.600174739059614,
"kl": 0.1050872802734375,
"learning_rate": 2.692307692307692e-07,
"loss": 0.0042,
"reward": 0.126953125,
"reward_std": 0.1962406411767006,
"rewards/correctness_reward_func": 0.1171875,
"rewards/strict_format_reward_func": 0.009765625,
"step": 7
},
{
"completion_length": 574.51171875,
"epoch": 0.009307737056428156,
"grad_norm": 151.48079010972526,
"kl": 0.18497467041015625,
"learning_rate": 3.076923076923077e-07,
"loss": 0.0074,
"reward": 0.115234375,
"reward_std": 0.1714012213051319,
"rewards/correctness_reward_func": 0.1015625,
"rewards/strict_format_reward_func": 0.013671875,
"step": 8
},
{
"completion_length": 456.30859375,
"epoch": 0.010471204188481676,
"grad_norm": 3.068318566100937,
"kl": 0.007595062255859375,
"learning_rate": 3.461538461538461e-07,
"loss": 0.0003,
"reward": 0.12890625,
"reward_std": 0.20751213282346725,
"rewards/correctness_reward_func": 0.1171875,
"rewards/strict_format_reward_func": 0.01171875,
"step": 9
},
{
"completion_length": 467.6171875,
"epoch": 0.011634671320535195,
"grad_norm": 5.54045688921292,
"kl": 0.02019977569580078,
"learning_rate": 3.8461538461538463e-07,
"loss": 0.0008,
"reward": 0.111328125,
"reward_std": 0.2094484455883503,
"rewards/correctness_reward_func": 0.1015625,
"rewards/strict_format_reward_func": 0.009765625,
"step": 10
},
{
"completion_length": 522.4609375,
"epoch": 0.012798138452588714,
"grad_norm": 1.6672527087542282,
"kl": 0.007293701171875,
"learning_rate": 4.2307692307692304e-07,
"loss": 0.0003,
"reward": 0.09375,
"reward_std": 0.14160171803086996,
"rewards/correctness_reward_func": 0.078125,
"rewards/strict_format_reward_func": 0.015625,
"step": 11
},
{
"completion_length": 554.41015625,
"epoch": 0.013961605584642234,
"grad_norm": 15.713162541614723,
"kl": 0.022916793823242188,
"learning_rate": 4.6153846153846156e-07,
"loss": 0.0009,
"reward": 0.05859375,
"reward_std": 0.09407384321093559,
"rewards/correctness_reward_func": 0.046875,
"rewards/strict_format_reward_func": 0.01171875,
"step": 12
},
{
"completion_length": 472.33203125,
"epoch": 0.015125072716695753,
"grad_norm": 9.999083570023721,
"kl": 0.00162506103515625,
"learning_rate": 5e-07,
"loss": 0.0001,
"reward": 0.099609375,
"reward_std": 0.16796875,
"rewards/correctness_reward_func": 0.09375,
"rewards/strict_format_reward_func": 0.005859375,
"step": 13
},
{
"completion_length": 471.0390625,
"epoch": 0.016288539848749273,
"grad_norm": 14.601550530650117,
"kl": 0.018660545349121094,
"learning_rate": 5.384615384615384e-07,
"loss": 0.0007,
"reward": 0.142578125,
"reward_std": 0.2406984455883503,
"rewards/correctness_reward_func": 0.1328125,
"rewards/strict_format_reward_func": 0.009765625,
"step": 14
},
{
"completion_length": 513.76953125,
"epoch": 0.017452006980802792,
"grad_norm": 3.897589890240525,
"kl": 0.0014133453369140625,
"learning_rate": 5.769230769230768e-07,
"loss": 0.0001,
"reward": 0.078125,
"reward_std": 0.13499781489372253,
"rewards/correctness_reward_func": 0.0703125,
"rewards/strict_format_reward_func": 0.0078125,
"step": 15
},
{
"completion_length": 461.609375,
"epoch": 0.01861547411285631,
"grad_norm": 0.2331700635519536,
"kl": 0.0008556842803955078,
"learning_rate": 6.153846153846154e-07,
"loss": 0.0,
"reward": 0.080078125,
"reward_std": 0.13233871944248676,
"rewards/correctness_reward_func": 0.0703125,
"rewards/strict_format_reward_func": 0.009765625,
"step": 16
},
{
"completion_length": 493.96484375,
"epoch": 0.01977894124490983,
"grad_norm": 0.6332584588781173,
"kl": 0.0008721351623535156,
"learning_rate": 6.538461538461538e-07,
"loss": 0.0,
"reward": 0.109375,
"reward_std": 0.19749781489372253,
"rewards/correctness_reward_func": 0.09375,
"rewards/strict_format_reward_func": 0.015625,
"step": 17
},
{
"completion_length": 499.87890625,
"epoch": 0.020942408376963352,
"grad_norm": 6.416190324649871,
"kl": 0.0064716339111328125,
"learning_rate": 6.923076923076922e-07,
"loss": 0.0003,
"reward": 0.0859375,
"reward_std": 0.1454593911767006,
"rewards/correctness_reward_func": 0.078125,
"rewards/strict_format_reward_func": 0.0078125,
"step": 18
},
{
"completion_length": 560.52734375,
"epoch": 0.02210587550901687,
"grad_norm": 0.18792971982824808,
"kl": 0.0007505416870117188,
"learning_rate": 7.307692307692307e-07,
"loss": 0.0,
"reward": 0.083984375,
"reward_std": 0.16796875,
"rewards/correctness_reward_func": 0.078125,
"rewards/strict_format_reward_func": 0.005859375,
"step": 19
},
{
"completion_length": 475.859375,
"epoch": 0.02326934264107039,
"grad_norm": 57.89397166781348,
"kl": 0.42646121978759766,
"learning_rate": 7.692307692307693e-07,
"loss": 0.0171,
"reward": 0.119140625,
"reward_std": 0.17447129637002945,
"rewards/correctness_reward_func": 0.1015625,
"rewards/strict_format_reward_func": 0.017578125,
"step": 20
},
{
"completion_length": 490.015625,
"epoch": 0.02443280977312391,
"grad_norm": 8.434807442056037,
"kl": 0.023256301879882812,
"learning_rate": 8.076923076923077e-07,
"loss": 0.0009,
"reward": 0.1171875,
"reward_std": 0.19881487637758255,
"rewards/correctness_reward_func": 0.09375,
"rewards/strict_format_reward_func": 0.0234375,
"step": 21
},
{
"completion_length": 506.33984375,
"epoch": 0.025596276905177427,
"grad_norm": 1.1653419639417089,
"kl": 0.00223541259765625,
"learning_rate": 8.461538461538461e-07,
"loss": 0.0001,
"reward": 0.10546875,
"reward_std": 0.18824483826756477,
"rewards/correctness_reward_func": 0.09375,
"rewards/strict_format_reward_func": 0.01171875,
"step": 22
},
{
"completion_length": 511.375,
"epoch": 0.02675974403723095,
"grad_norm": 0.41128365761978974,
"kl": 0.0008344650268554688,
"learning_rate": 8.846153846153846e-07,
"loss": 0.0,
"reward": 0.150390625,
"reward_std": 0.25167298316955566,
"rewards/correctness_reward_func": 0.1328125,
"rewards/strict_format_reward_func": 0.017578125,
"step": 23
},
{
"completion_length": 528.53125,
"epoch": 0.027923211169284468,
"grad_norm": 1.4081194117907943,
"kl": 0.0036749839782714844,
"learning_rate": 9.230769230769231e-07,
"loss": 0.0001,
"reward": 0.09765625,
"reward_std": 0.16415445879101753,
"rewards/correctness_reward_func": 0.09375,
"rewards/strict_format_reward_func": 0.00390625,
"step": 24
},
{
"completion_length": 566.2265625,
"epoch": 0.029086678301337987,
"grad_norm": 1.1799989005370515,
"kl": 0.009830474853515625,
"learning_rate": 9.615384615384615e-07,
"loss": 0.0004,
"reward": 0.107421875,
"reward_std": 0.16424159705638885,
"rewards/correctness_reward_func": 0.09375,
"rewards/strict_format_reward_func": 0.013671875,
"step": 25
},
{
"completion_length": 438.703125,
"epoch": 0.030250145433391506,
"grad_norm": 3.104360216151301,
"kl": 0.03290557861328125,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 0.15234375,
"reward_std": 0.20628703013062477,
"rewards/correctness_reward_func": 0.140625,
"rewards/strict_format_reward_func": 0.01171875,
"step": 26
},
{
"completion_length": 539.5859375,
"epoch": 0.031413612565445025,
"grad_norm": 0.5338598772393494,
"kl": 0.005832672119140625,
"learning_rate": 9.99996444102478e-07,
"loss": 0.0002,
"reward": 0.1015625,
"reward_std": 0.17196696251630783,
"rewards/correctness_reward_func": 0.0859375,
"rewards/strict_format_reward_func": 0.015625,
"step": 27
},
{
"completion_length": 497.140625,
"epoch": 0.03257707969749855,
"grad_norm": 2.5666979824433844,
"kl": 0.018198013305664062,
"learning_rate": 9.999857764604895e-07,
"loss": 0.0007,
"reward": 0.140625,
"reward_std": 0.21521097421646118,
"rewards/correctness_reward_func": 0.125,
"rewards/strict_format_reward_func": 0.015625,
"step": 28
},
{
"completion_length": 485.59375,
"epoch": 0.03374054682955206,
"grad_norm": 6.4345252573344816,
"kl": 0.005001068115234375,
"learning_rate": 9.999679972257667e-07,
"loss": 0.0002,
"reward": 0.1875,
"reward_std": 0.2648322060704231,
"rewards/correctness_reward_func": 0.1640625,
"rewards/strict_format_reward_func": 0.0234375,
"step": 29
},
{
"completion_length": 416.53125,
"epoch": 0.034904013961605584,
"grad_norm": 2.602336527370174,
"kl": 0.0277099609375,
"learning_rate": 9.999431066511943e-07,
"loss": 0.0011,
"reward": 0.125,
"reward_std": 0.21884196251630783,
"rewards/correctness_reward_func": 0.109375,
"rewards/strict_format_reward_func": 0.015625,
"step": 30
},
{
"completion_length": 485.08203125,
"epoch": 0.03606748109365911,
"grad_norm": 1.5342850964012629,
"kl": 0.014495849609375,
"learning_rate": 9.999111050908056e-07,
"loss": 0.0006,
"reward": 0.197265625,
"reward_std": 0.3104500323534012,
"rewards/correctness_reward_func": 0.1796875,
"rewards/strict_format_reward_func": 0.017578125,
"step": 31
},
{
"completion_length": 531.640625,
"epoch": 0.03723094822571262,
"grad_norm": 20.359434755302512,
"kl": 0.06761932373046875,
"learning_rate": 9.998719929997773e-07,
"loss": 0.0027,
"reward": 0.142578125,
"reward_std": 0.19669455289840698,
"rewards/correctness_reward_func": 0.1171875,
"rewards/strict_format_reward_func": 0.025390625,
"step": 32
},
{
"completion_length": 514.54296875,
"epoch": 0.038394415357766144,
"grad_norm": 0.44887804375426404,
"kl": 0.003627777099609375,
"learning_rate": 9.998257709344243e-07,
"loss": 0.0001,
"reward": 0.173828125,
"reward_std": 0.24332467839121819,
"rewards/correctness_reward_func": 0.171875,
"rewards/strict_format_reward_func": 0.001953125,
"step": 33
},
{
"completion_length": 492.8125,
"epoch": 0.03955788248981966,
"grad_norm": 3.197350305028001,
"kl": 0.007907867431640625,
"learning_rate": 9.997724395521901e-07,
"loss": 0.0003,
"reward": 0.14453125,
"reward_std": 0.24139471352100372,
"rewards/correctness_reward_func": 0.1171875,
"rewards/strict_format_reward_func": 0.02734375,
"step": 34
},
{
"completion_length": 407.99609375,
"epoch": 0.04072134962187318,
"grad_norm": 0.4960254389285239,
"kl": 0.01204681396484375,
"learning_rate": 9.997119996116382e-07,
"loss": 0.0005,
"reward": 0.240234375,
"reward_std": 0.31166573986411095,
"rewards/correctness_reward_func": 0.2109375,
"rewards/strict_format_reward_func": 0.029296875,
"step": 35
},
{
"completion_length": 398.2421875,
"epoch": 0.041884816753926704,
"grad_norm": 9.032088364099023,
"kl": 0.03583526611328125,
"learning_rate": 9.996444519724418e-07,
"loss": 0.0014,
"reward": 0.15625,
"reward_std": 0.23204976320266724,
"rewards/correctness_reward_func": 0.140625,
"rewards/strict_format_reward_func": 0.015625,
"step": 36
},
{
"completion_length": 434.36328125,
"epoch": 0.04304828388598022,
"grad_norm": 0.5171450751911104,
"kl": 0.0065155029296875,
"learning_rate": 9.995697975953707e-07,
"loss": 0.0003,
"reward": 0.193359375,
"reward_std": 0.29260406643152237,
"rewards/correctness_reward_func": 0.15625,
"rewards/strict_format_reward_func": 0.037109375,
"step": 37
},
{
"completion_length": 488.390625,
"epoch": 0.04421175101803374,
"grad_norm": 0.40237380286765173,
"kl": 0.00611114501953125,
"learning_rate": 9.994880375422784e-07,
"loss": 0.0002,
"reward": 0.26171875,
"reward_std": 0.3653857484459877,
"rewards/correctness_reward_func": 0.203125,
"rewards/strict_format_reward_func": 0.05859375,
"step": 38
},
{
"completion_length": 420.58984375,
"epoch": 0.04537521815008726,
"grad_norm": 5.104453757523126,
"kl": 0.01647186279296875,
"learning_rate": 9.99399172976086e-07,
"loss": 0.0007,
"reward": 0.29296875,
"reward_std": 0.37514638155698776,
"rewards/correctness_reward_func": 0.234375,
"rewards/strict_format_reward_func": 0.05859375,
"step": 39
},
{
"completion_length": 394.3828125,
"epoch": 0.04653868528214078,
"grad_norm": 3.4950699311546694,
"kl": 0.009918212890625,
"learning_rate": 9.993032051607668e-07,
"loss": 0.0004,
"reward": 0.267578125,
"reward_std": 0.31589680910110474,
"rewards/correctness_reward_func": 0.2265625,
"rewards/strict_format_reward_func": 0.041015625,
"step": 40
},
{
"completion_length": 413.97265625,
"epoch": 0.0477021524141943,
"grad_norm": 1.061976991134195,
"kl": 0.0106964111328125,
"learning_rate": 9.992001354613277e-07,
"loss": 0.0004,
"reward": 0.21484375,
"reward_std": 0.2792557002976537,
"rewards/correctness_reward_func": 0.171875,
"rewards/strict_format_reward_func": 0.04296875,
"step": 41
},
{
"completion_length": 397.8515625,
"epoch": 0.04886561954624782,
"grad_norm": 0.765028003111525,
"kl": 0.01513671875,
"learning_rate": 9.990899653437901e-07,
"loss": 0.0006,
"reward": 0.388671875,
"reward_std": 0.41248171031475067,
"rewards/correctness_reward_func": 0.3125,
"rewards/strict_format_reward_func": 0.076171875,
"step": 42
},
{
"completion_length": 397.07421875,
"epoch": 0.05002908667830134,
"grad_norm": 12.065798447865369,
"kl": 0.06500244140625,
"learning_rate": 9.989726963751682e-07,
"loss": 0.0026,
"reward": 0.3125,
"reward_std": 0.4369198568165302,
"rewards/correctness_reward_func": 0.234375,
"rewards/strict_format_reward_func": 0.078125,
"step": 43
},
{
"completion_length": 384.0859375,
"epoch": 0.051192553810354854,
"grad_norm": 10.020233386348186,
"kl": 0.1649932861328125,
"learning_rate": 9.988483302234478e-07,
"loss": 0.0066,
"reward": 0.275390625,
"reward_std": 0.3524422347545624,
"rewards/correctness_reward_func": 0.1796875,
"rewards/strict_format_reward_func": 0.095703125,
"step": 44
},
{
"completion_length": 424.24609375,
"epoch": 0.05235602094240838,
"grad_norm": 3809.223926800563,
"kl": 24.387359619140625,
"learning_rate": 9.987168686575623e-07,
"loss": 0.9774,
"reward": 0.26953125,
"reward_std": 0.3424443453550339,
"rewards/correctness_reward_func": 0.1796875,
"rewards/strict_format_reward_func": 0.08984375,
"step": 45
},
{
"completion_length": 365.3359375,
"epoch": 0.0535194880744619,
"grad_norm": 4.815371512573417,
"kl": 0.0827484130859375,
"learning_rate": 9.98578313547367e-07,
"loss": 0.0033,
"reward": 0.39453125,
"reward_std": 0.45618152618408203,
"rewards/correctness_reward_func": 0.2890625,
"rewards/strict_format_reward_func": 0.10546875,
"step": 46
},
{
"completion_length": 410.48828125,
"epoch": 0.054682955206515414,
"grad_norm": 3.1518114378957223,
"kl": 0.03973388671875,
"learning_rate": 9.98432666863613e-07,
"loss": 0.0016,
"reward": 0.365234375,
"reward_std": 0.4011538214981556,
"rewards/correctness_reward_func": 0.234375,
"rewards/strict_format_reward_func": 0.130859375,
"step": 47
},
{
"completion_length": 409.66796875,
"epoch": 0.055846422338568937,
"grad_norm": 0.6202311602466106,
"kl": 0.025848388671875,
"learning_rate": 9.982799306779189e-07,
"loss": 0.001,
"reward": 0.591796875,
"reward_std": 0.5869772136211395,
"rewards/correctness_reward_func": 0.421875,
"rewards/strict_format_reward_func": 0.169921875,
"step": 48
},
{
"completion_length": 375.16015625,
"epoch": 0.05700988947062245,
"grad_norm": 231.68110012914804,
"kl": 0.854034423828125,
"learning_rate": 9.98120107162742e-07,
"loss": 0.0343,
"reward": 0.42578125,
"reward_std": 0.4635503552854061,
"rewards/correctness_reward_func": 0.2421875,
"rewards/strict_format_reward_func": 0.18359375,
"step": 49
},
{
"completion_length": 396.5,
"epoch": 0.058173356602675974,
"grad_norm": 8.054334866966464,
"kl": 0.02935791015625,
"learning_rate": 9.979531985913457e-07,
"loss": 0.0012,
"reward": 0.43359375,
"reward_std": 0.5311293751001358,
"rewards/correctness_reward_func": 0.2421875,
"rewards/strict_format_reward_func": 0.19140625,
"step": 50
},
{
"completion_length": 341.55859375,
"epoch": 0.059336823734729496,
"grad_norm": 39.40136884647299,
"kl": 0.515869140625,
"learning_rate": 9.977792073377697e-07,
"loss": 0.0206,
"reward": 0.50390625,
"reward_std": 0.5089018940925598,
"rewards/correctness_reward_func": 0.3203125,
"rewards/strict_format_reward_func": 0.18359375,
"step": 51
},
{
"completion_length": 375.640625,
"epoch": 0.06050029086678301,
"grad_norm": 20.902673745049487,
"kl": 0.14080810546875,
"learning_rate": 9.975981358767944e-07,
"loss": 0.0056,
"reward": 0.453125,
"reward_std": 0.47038574516773224,
"rewards/correctness_reward_func": 0.2578125,
"rewards/strict_format_reward_func": 0.1953125,
"step": 52
},
{
"completion_length": 351.328125,
"epoch": 0.061663757998836534,
"grad_norm": 2.0754249823099595,
"kl": 0.0267333984375,
"learning_rate": 9.974099867839057e-07,
"loss": 0.0011,
"reward": 0.525390625,
"reward_std": 0.46075718849897385,
"rewards/correctness_reward_func": 0.2890625,
"rewards/strict_format_reward_func": 0.236328125,
"step": 53
},
{
"completion_length": 385.4140625,
"epoch": 0.06282722513089005,
"grad_norm": 335.2509946112828,
"kl": 1.28887939453125,
"learning_rate": 9.972147627352593e-07,
"loss": 0.0513,
"reward": 0.533203125,
"reward_std": 0.4158325716853142,
"rewards/correctness_reward_func": 0.3125,
"rewards/strict_format_reward_func": 0.220703125,
"step": 54
},
{
"completion_length": 315.51953125,
"epoch": 0.06399069226294357,
"grad_norm": 113.13047615199778,
"kl": 0.79083251953125,
"learning_rate": 9.970124665076417e-07,
"loss": 0.0317,
"reward": 0.5625,
"reward_std": 0.5517344921827316,
"rewards/correctness_reward_func": 0.3203125,
"rewards/strict_format_reward_func": 0.2421875,
"step": 55
},
{
"completion_length": 313.81640625,
"epoch": 0.0651541593949971,
"grad_norm": 1.1795690608203528,
"kl": 0.047210693359375,
"learning_rate": 9.96803100978432e-07,
"loss": 0.0019,
"reward": 0.642578125,
"reward_std": 0.47613072395324707,
"rewards/correctness_reward_func": 0.34375,
"rewards/strict_format_reward_func": 0.298828125,
"step": 56
},
{
"completion_length": 348.234375,
"epoch": 0.06631762652705062,
"grad_norm": 13.371406300782917,
"kl": 0.11224365234375,
"learning_rate": 9.965866691255597e-07,
"loss": 0.0045,
"reward": 0.66796875,
"reward_std": 0.59251669049263,
"rewards/correctness_reward_func": 0.390625,
"rewards/strict_format_reward_func": 0.27734375,
"step": 57
},
{
"completion_length": 383.1796875,
"epoch": 0.06748109365910412,
"grad_norm": 1.6177251711954495,
"kl": 0.030792236328125,
"learning_rate": 9.963631740274622e-07,
"loss": 0.0012,
"reward": 0.5625,
"reward_std": 0.47907302528619766,
"rewards/correctness_reward_func": 0.2890625,
"rewards/strict_format_reward_func": 0.2734375,
"step": 58
},
{
"completion_length": 355.9921875,
"epoch": 0.06864456079115765,
"grad_norm": 47.382528946856574,
"kl": 0.235626220703125,
"learning_rate": 9.961326188630425e-07,
"loss": 0.0095,
"reward": 0.607421875,
"reward_std": 0.535815954208374,
"rewards/correctness_reward_func": 0.34375,
"rewards/strict_format_reward_func": 0.263671875,
"step": 59
},
{
"completion_length": 377.46875,
"epoch": 0.06980802792321117,
"grad_norm": 2.8906662162333294,
"kl": 0.076416015625,
"learning_rate": 9.95895006911623e-07,
"loss": 0.0031,
"reward": 0.6015625,
"reward_std": 0.4677914008498192,
"rewards/correctness_reward_func": 0.3046875,
"rewards/strict_format_reward_func": 0.296875,
"step": 60
},
{
"completion_length": 307.0859375,
"epoch": 0.07097149505526469,
"grad_norm": 5.658248453671423,
"kl": 0.06695556640625,
"learning_rate": 9.956503415528982e-07,
"loss": 0.0027,
"reward": 0.60546875,
"reward_std": 0.5141743049025536,
"rewards/correctness_reward_func": 0.2734375,
"rewards/strict_format_reward_func": 0.33203125,
"step": 61
},
{
"completion_length": 308.98046875,
"epoch": 0.07213496218731821,
"grad_norm": 4.0281675399539,
"kl": 0.06524658203125,
"learning_rate": 9.953986262668884e-07,
"loss": 0.0026,
"reward": 0.66015625,
"reward_std": 0.4833526313304901,
"rewards/correctness_reward_func": 0.28125,
"rewards/strict_format_reward_func": 0.37890625,
"step": 62
},
{
"completion_length": 251.5625,
"epoch": 0.07329842931937172,
"grad_norm": 6.4708893162037455,
"kl": 0.13079833984375,
"learning_rate": 9.951398646338883e-07,
"loss": 0.0052,
"reward": 0.720703125,
"reward_std": 0.4117320328950882,
"rewards/correctness_reward_func": 0.3515625,
"rewards/strict_format_reward_func": 0.369140625,
"step": 63
},
{
"completion_length": 299.58203125,
"epoch": 0.07446189645142524,
"grad_norm": 13.45087311669488,
"kl": 0.15289306640625,
"learning_rate": 9.948740603344172e-07,
"loss": 0.0061,
"reward": 0.81640625,
"reward_std": 0.6002717912197113,
"rewards/correctness_reward_func": 0.4765625,
"rewards/strict_format_reward_func": 0.33984375,
"step": 64
},
{
"completion_length": 302.1953125,
"epoch": 0.07562536358347877,
"grad_norm": 18.560697276437615,
"kl": 0.20904541015625,
"learning_rate": 9.946012171491668e-07,
"loss": 0.0083,
"reward": 0.796875,
"reward_std": 0.4715307354927063,
"rewards/correctness_reward_func": 0.4140625,
"rewards/strict_format_reward_func": 0.3828125,
"step": 65
},
{
"completion_length": 286.73828125,
"epoch": 0.07678883071553229,
"grad_norm": 14.086886383004378,
"kl": 0.04949951171875,
"learning_rate": 9.943213389589466e-07,
"loss": 0.002,
"reward": 0.82421875,
"reward_std": 0.5994473099708557,
"rewards/correctness_reward_func": 0.4453125,
"rewards/strict_format_reward_func": 0.37890625,
"step": 66
},
{
"completion_length": 323.93359375,
"epoch": 0.07795229784758581,
"grad_norm": 19.144971581802082,
"kl": 0.1656494140625,
"learning_rate": 9.940344297446292e-07,
"loss": 0.0066,
"reward": 0.73828125,
"reward_std": 0.43859150260686874,
"rewards/correctness_reward_func": 0.3671875,
"rewards/strict_format_reward_func": 0.37109375,
"step": 67
},
{
"completion_length": 350.01171875,
"epoch": 0.07911576497963932,
"grad_norm": 77.24678279515392,
"kl": 0.53643798828125,
"learning_rate": 9.937404935870937e-07,
"loss": 0.0215,
"reward": 0.61328125,
"reward_std": 0.4140402674674988,
"rewards/correctness_reward_func": 0.21875,
"rewards/strict_format_reward_func": 0.39453125,
"step": 68
},
{
"completion_length": 305.87109375,
"epoch": 0.08027923211169284,
"grad_norm": 12.883736811037531,
"kl": 0.25457763671875,
"learning_rate": 9.934395346671673e-07,
"loss": 0.0102,
"reward": 0.705078125,
"reward_std": 0.48615749180316925,
"rewards/correctness_reward_func": 0.3125,
"rewards/strict_format_reward_func": 0.392578125,
"step": 69
},
{
"completion_length": 279.07421875,
"epoch": 0.08144269924374636,
"grad_norm": 4.8084774229944705,
"kl": 0.158447265625,
"learning_rate": 9.93131557265567e-07,
"loss": 0.0063,
"reward": 0.775390625,
"reward_std": 0.43615715205669403,
"rewards/correctness_reward_func": 0.390625,
"rewards/strict_format_reward_func": 0.384765625,
"step": 70
},
{
"completion_length": 271.90625,
"epoch": 0.08260616637579989,
"grad_norm": 209.3718112774744,
"kl": 1.3720703125,
"learning_rate": 9.928165657628363e-07,
"loss": 0.0552,
"reward": 0.771484375,
"reward_std": 0.4666217863559723,
"rewards/correctness_reward_func": 0.375,
"rewards/strict_format_reward_func": 0.396484375,
"step": 71
},
{
"completion_length": 280.234375,
"epoch": 0.08376963350785341,
"grad_norm": 8.675435114113355,
"kl": 0.08306884765625,
"learning_rate": 9.924945646392856e-07,
"loss": 0.0033,
"reward": 0.724609375,
"reward_std": 0.4121067523956299,
"rewards/correctness_reward_func": 0.328125,
"rewards/strict_format_reward_func": 0.396484375,
"step": 72
},
{
"completion_length": 298.13671875,
"epoch": 0.08493310063990692,
"grad_norm": 2.261657798977972,
"kl": 0.05487060546875,
"learning_rate": 9.92165558474927e-07,
"loss": 0.0022,
"reward": 0.79296875,
"reward_std": 0.4348938390612602,
"rewards/correctness_reward_func": 0.40625,
"rewards/strict_format_reward_func": 0.38671875,
"step": 73
},
{
"completion_length": 275.25390625,
"epoch": 0.08609656777196044,
"grad_norm": 306.8537482842207,
"kl": 2.76251220703125,
"learning_rate": 9.918295519494089e-07,
"loss": 0.1104,
"reward": 0.853515625,
"reward_std": 0.4975513890385628,
"rewards/correctness_reward_func": 0.4375,
"rewards/strict_format_reward_func": 0.416015625,
"step": 74
},
{
"completion_length": 303.42578125,
"epoch": 0.08726003490401396,
"grad_norm": 11.231963005062614,
"kl": 0.20123291015625,
"learning_rate": 9.91486549841951e-07,
"loss": 0.008,
"reward": 0.720703125,
"reward_std": 0.46410517394542694,
"rewards/correctness_reward_func": 0.3359375,
"rewards/strict_format_reward_func": 0.384765625,
"step": 75
},
{
"completion_length": 277.7578125,
"epoch": 0.08842350203606748,
"grad_norm": 160.47901765005733,
"kl": 1.1749267578125,
"learning_rate": 9.91136557031274e-07,
"loss": 0.0472,
"reward": 0.74609375,
"reward_std": 0.4058147594332695,
"rewards/correctness_reward_func": 0.3125,
"rewards/strict_format_reward_func": 0.43359375,
"step": 76
},
{
"completion_length": 340.8515625,
"epoch": 0.089586969168121,
"grad_norm": 381.85614779047495,
"kl": 1.22967529296875,
"learning_rate": 9.907795784955326e-07,
"loss": 0.0492,
"reward": 0.765625,
"reward_std": 0.42379553616046906,
"rewards/correctness_reward_func": 0.3515625,
"rewards/strict_format_reward_func": 0.4140625,
"step": 77
},
{
"completion_length": 274.44921875,
"epoch": 0.09075043630017451,
"grad_norm": 64.47137880223688,
"kl": 0.67572021484375,
"learning_rate": 9.904156193122431e-07,
"loss": 0.027,
"reward": 0.80078125,
"reward_std": 0.4361772760748863,
"rewards/correctness_reward_func": 0.3828125,
"rewards/strict_format_reward_func": 0.41796875,
"step": 78
},
{
"completion_length": 265.8359375,
"epoch": 0.09191390343222804,
"grad_norm": 27.903843212195564,
"kl": 0.46124267578125,
"learning_rate": 9.900446846582119e-07,
"loss": 0.0185,
"reward": 0.92578125,
"reward_std": 0.5412914976477623,
"rewards/correctness_reward_func": 0.5078125,
"rewards/strict_format_reward_func": 0.41796875,
"step": 79
},
{
"completion_length": 320.8984375,
"epoch": 0.09307737056428156,
"grad_norm": 1.9761659190639995,
"kl": 0.04864501953125,
"learning_rate": 9.896667798094608e-07,
"loss": 0.0019,
"reward": 0.76953125,
"reward_std": 0.4052440747618675,
"rewards/correctness_reward_func": 0.375,
"rewards/strict_format_reward_func": 0.39453125,
"step": 80
},
{
"completion_length": 312.84765625,
"epoch": 0.09424083769633508,
"grad_norm": 57.910687963151915,
"kl": 0.12591552734375,
"learning_rate": 9.892819101411543e-07,
"loss": 0.005,
"reward": 0.75,
"reward_std": 0.49195902049541473,
"rewards/correctness_reward_func": 0.3359375,
"rewards/strict_format_reward_func": 0.4140625,
"step": 81
},
{
"completion_length": 271.83984375,
"epoch": 0.0954043048283886,
"grad_norm": 5.41836244806774,
"kl": 0.08587646484375,
"learning_rate": 9.888900811275203e-07,
"loss": 0.0034,
"reward": 0.71484375,
"reward_std": 0.43381768465042114,
"rewards/correctness_reward_func": 0.2890625,
"rewards/strict_format_reward_func": 0.42578125,
"step": 82
},
{
"completion_length": 290.12109375,
"epoch": 0.09656777196044211,
"grad_norm": 0.8646498967374704,
"kl": 0.0540771484375,
"learning_rate": 9.884912983417743e-07,
"loss": 0.0022,
"reward": 0.892578125,
"reward_std": 0.47797150164842606,
"rewards/correctness_reward_func": 0.46875,
"rewards/strict_format_reward_func": 0.423828125,
"step": 83
},
{
"completion_length": 224.84375,
"epoch": 0.09773123909249563,
"grad_norm": 2.249327867839398,
"kl": 0.0767822265625,
"learning_rate": 9.88085567456039e-07,
"loss": 0.0031,
"reward": 0.927734375,
"reward_std": 0.5308554023504257,
"rewards/correctness_reward_func": 0.4765625,
"rewards/strict_format_reward_func": 0.451171875,
"step": 84
},
{
"completion_length": 268.13671875,
"epoch": 0.09889470622454916,
"grad_norm": 5.4143471770769445,
"kl": 0.10394287109375,
"learning_rate": 9.876728942412642e-07,
"loss": 0.0042,
"reward": 0.849609375,
"reward_std": 0.44555214792490005,
"rewards/correctness_reward_func": 0.4140625,
"rewards/strict_format_reward_func": 0.435546875,
"step": 85
},
{
"completion_length": 310.32421875,
"epoch": 0.10005817335660268,
"grad_norm": 99.7419314506415,
"kl": 0.43487548828125,
"learning_rate": 9.872532845671449e-07,
"loss": 0.0174,
"reward": 0.619140625,
"reward_std": 0.2979493774473667,
"rewards/correctness_reward_func": 0.203125,
"rewards/strict_format_reward_func": 0.416015625,
"step": 86
},
{
"completion_length": 336.62109375,
"epoch": 0.1012216404886562,
"grad_norm": 2.7658118091947093,
"kl": 0.0557861328125,
"learning_rate": 9.868267444020366e-07,
"loss": 0.0022,
"reward": 0.84375,
"reward_std": 0.4424229711294174,
"rewards/correctness_reward_func": 0.4140625,
"rewards/strict_format_reward_func": 0.4296875,
"step": 87
},
{
"completion_length": 267.109375,
"epoch": 0.10238510762070971,
"grad_norm": 1.3296554071596351,
"kl": 0.05279541015625,
"learning_rate": 9.86393279812872e-07,
"loss": 0.0021,
"reward": 0.763671875,
"reward_std": 0.40575000643730164,
"rewards/correctness_reward_func": 0.3203125,
"rewards/strict_format_reward_func": 0.443359375,
"step": 88
},
{
"completion_length": 286.4453125,
"epoch": 0.10354857475276323,
"grad_norm": 1.9800959128263669,
"kl": 0.0650634765625,
"learning_rate": 9.859528969650737e-07,
"loss": 0.0026,
"reward": 0.70703125,
"reward_std": 0.36570068448781967,
"rewards/correctness_reward_func": 0.2734375,
"rewards/strict_format_reward_func": 0.43359375,
"step": 89
},
{
"completion_length": 315.234375,
"epoch": 0.10471204188481675,
"grad_norm": 0.9855921470718585,
"kl": 0.057861328125,
"learning_rate": 9.855056021224671e-07,
"loss": 0.0023,
"reward": 0.78125,
"reward_std": 0.38219955191016197,
"rewards/correctness_reward_func": 0.359375,
"rewards/strict_format_reward_func": 0.421875,
"step": 90
},
{
"completion_length": 283.63671875,
"epoch": 0.10587550901687028,
"grad_norm": 2.2979566856097633,
"kl": 0.05584716796875,
"learning_rate": 9.850514016471902e-07,
"loss": 0.0022,
"reward": 0.814453125,
"reward_std": 0.4690292477607727,
"rewards/correctness_reward_func": 0.375,
"rewards/strict_format_reward_func": 0.439453125,
"step": 91
},
{
"completion_length": 280.4921875,
"epoch": 0.1070389761489238,
"grad_norm": 54.88062525909732,
"kl": 0.2271728515625,
"learning_rate": 9.845903019996045e-07,
"loss": 0.0091,
"reward": 0.732421875,
"reward_std": 0.3464353382587433,
"rewards/correctness_reward_func": 0.296875,
"rewards/strict_format_reward_func": 0.435546875,
"step": 92
},
{
"completion_length": 257.05078125,
"epoch": 0.1082024432809773,
"grad_norm": 2.4355438627389714,
"kl": 0.16082763671875,
"learning_rate": 9.841223097382027e-07,
"loss": 0.0065,
"reward": 0.921875,
"reward_std": 0.5120889246463776,
"rewards/correctness_reward_func": 0.484375,
"rewards/strict_format_reward_func": 0.4375,
"step": 93
},
{
"completion_length": 330.67578125,
"epoch": 0.10936591041303083,
"grad_norm": 0.29946740274215317,
"kl": 0.04864501953125,
"learning_rate": 9.836474315195147e-07,
"loss": 0.0019,
"reward": 0.734375,
"reward_std": 0.4280121922492981,
"rewards/correctness_reward_func": 0.3125,
"rewards/strict_format_reward_func": 0.421875,
"step": 94
},
{
"completion_length": 293.5234375,
"epoch": 0.11052937754508435,
"grad_norm": 0.3362629809809694,
"kl": 0.045654296875,
"learning_rate": 9.831656740980135e-07,
"loss": 0.0018,
"reward": 0.716796875,
"reward_std": 0.4403715208172798,
"rewards/correctness_reward_func": 0.296875,
"rewards/strict_format_reward_func": 0.419921875,
"step": 95
},
{
"completion_length": 237.234375,
"epoch": 0.11169284467713787,
"grad_norm": 0.3082617927162893,
"kl": 0.05224609375,
"learning_rate": 9.826770443260193e-07,
"loss": 0.0021,
"reward": 0.818359375,
"reward_std": 0.431897908449173,
"rewards/correctness_reward_func": 0.3828125,
"rewards/strict_format_reward_func": 0.435546875,
"step": 96
},
{
"completion_length": 272.08984375,
"epoch": 0.1128563118091914,
"grad_norm": 3.297879330447764,
"kl": 0.06298828125,
"learning_rate": 9.821815491536016e-07,
"loss": 0.0025,
"reward": 0.86328125,
"reward_std": 0.4964246600866318,
"rewards/correctness_reward_func": 0.4296875,
"rewards/strict_format_reward_func": 0.43359375,
"step": 97
},
{
"completion_length": 313.859375,
"epoch": 0.1140197789412449,
"grad_norm": 1.7917369827084497,
"kl": 0.05108642578125,
"learning_rate": 9.81679195628481e-07,
"loss": 0.002,
"reward": 0.69140625,
"reward_std": 0.4303680807352066,
"rewards/correctness_reward_func": 0.265625,
"rewards/strict_format_reward_func": 0.42578125,
"step": 98
},
{
"completion_length": 234.34375,
"epoch": 0.11518324607329843,
"grad_norm": 2.373852143479517,
"kl": 0.07373046875,
"learning_rate": 9.811699908959275e-07,
"loss": 0.0029,
"reward": 0.927734375,
"reward_std": 0.4275398887693882,
"rewards/correctness_reward_func": 0.4765625,
"rewards/strict_format_reward_func": 0.451171875,
"step": 99
},
{
"completion_length": 267.76953125,
"epoch": 0.11634671320535195,
"grad_norm": 1.5428375901598974,
"kl": 0.05035400390625,
"learning_rate": 9.806539421986608e-07,
"loss": 0.002,
"reward": 0.763671875,
"reward_std": 0.45994649082422256,
"rewards/correctness_reward_func": 0.3203125,
"rewards/strict_format_reward_func": 0.443359375,
"step": 100
},
{
"completion_length": 245.25390625,
"epoch": 0.11751018033740547,
"grad_norm": 2.3658036123831083,
"kl": 0.04681396484375,
"learning_rate": 9.80131056876746e-07,
"loss": 0.0019,
"reward": 0.8828125,
"reward_std": 0.43228569626808167,
"rewards/correctness_reward_func": 0.4375,
"rewards/strict_format_reward_func": 0.4453125,
"step": 101
},
{
"completion_length": 229.6953125,
"epoch": 0.11867364746945899,
"grad_norm": 7.873285555480436,
"kl": 0.05926513671875,
"learning_rate": 9.796013423674898e-07,
"loss": 0.0024,
"reward": 0.75,
"reward_std": 0.3967752233147621,
"rewards/correctness_reward_func": 0.328125,
"rewards/strict_format_reward_func": 0.421875,
"step": 102
},
{
"completion_length": 344.07421875,
"epoch": 0.1198371146015125,
"grad_norm": 15.56401233276747,
"kl": 0.1546630859375,
"learning_rate": 9.79064806205334e-07,
"loss": 0.0062,
"reward": 0.810546875,
"reward_std": 0.44128578901290894,
"rewards/correctness_reward_func": 0.3828125,
"rewards/strict_format_reward_func": 0.427734375,
"step": 103
},
{
"completion_length": 282.15625,
"epoch": 0.12100058173356602,
"grad_norm": 5.111153394673512,
"kl": 0.0589599609375,
"learning_rate": 9.78521456021749e-07,
"loss": 0.0024,
"reward": 0.80078125,
"reward_std": 0.48010821640491486,
"rewards/correctness_reward_func": 0.3671875,
"rewards/strict_format_reward_func": 0.43359375,
"step": 104
},
{
"completion_length": 242.20703125,
"epoch": 0.12216404886561955,
"grad_norm": 6.003504446851319,
"kl": 0.05206298828125,
"learning_rate": 9.779712995451252e-07,
"loss": 0.0021,
"reward": 0.80859375,
"reward_std": 0.443262055516243,
"rewards/correctness_reward_func": 0.375,
"rewards/strict_format_reward_func": 0.43359375,
"step": 105
},
{
"completion_length": 276.546875,
"epoch": 0.12332751599767307,
"grad_norm": 4.073486002796206,
"kl": 0.054443359375,
"learning_rate": 9.77414344600663e-07,
"loss": 0.0022,
"reward": 0.767578125,
"reward_std": 0.46208247542381287,
"rewards/correctness_reward_func": 0.3359375,
"rewards/strict_format_reward_func": 0.431640625,
"step": 106
},
{
"completion_length": 257.03515625,
"epoch": 0.12449098312972659,
"grad_norm": 18.5627783152072,
"kl": 0.087646484375,
"learning_rate": 9.76850599110261e-07,
"loss": 0.0035,
"reward": 0.794921875,
"reward_std": 0.4035182222723961,
"rewards/correctness_reward_func": 0.3671875,
"rewards/strict_format_reward_func": 0.427734375,
"step": 107
},
{
"completion_length": 312.40625,
"epoch": 0.1256544502617801,
"grad_norm": 6.982791090576697,
"kl": 0.115966796875,
"learning_rate": 9.762800710924038e-07,
"loss": 0.0046,
"reward": 0.783203125,
"reward_std": 0.5103202238678932,
"rewards/correctness_reward_func": 0.375,
"rewards/strict_format_reward_func": 0.408203125,
"step": 108
},
{
"completion_length": 266.125,
"epoch": 0.12681791739383363,
"grad_norm": 1.474075928860927,
"kl": 0.04534912109375,
"learning_rate": 9.75702768662048e-07,
"loss": 0.0018,
"reward": 0.828125,
"reward_std": 0.398033931851387,
"rewards/correctness_reward_func": 0.390625,
"rewards/strict_format_reward_func": 0.4375,
"step": 109
},
{
"completion_length": 291.98828125,
"epoch": 0.12798138452588714,
"grad_norm": 1082.0566595703972,
"kl": 2.355712890625,
"learning_rate": 9.751187000305074e-07,
"loss": 0.0937,
"reward": 0.890625,
"reward_std": 0.4832841530442238,
"rewards/correctness_reward_func": 0.4609375,
"rewards/strict_format_reward_func": 0.4296875,
"step": 110
},
{
"completion_length": 257.46484375,
"epoch": 0.12914485165794065,
"grad_norm": 8.260881771838365,
"kl": 0.305908203125,
"learning_rate": 9.745278735053343e-07,
"loss": 0.0122,
"reward": 0.791015625,
"reward_std": 0.5271246433258057,
"rewards/correctness_reward_func": 0.3671875,
"rewards/strict_format_reward_func": 0.423828125,
"step": 111
},
{
"completion_length": 278.26171875,
"epoch": 0.1303083187899942,
"grad_norm": 7.127367987174519,
"kl": 0.2362060546875,
"learning_rate": 9.73930297490203e-07,
"loss": 0.0094,
"reward": 0.650390625,
"reward_std": 0.3026326783001423,
"rewards/correctness_reward_func": 0.2421875,
"rewards/strict_format_reward_func": 0.408203125,
"step": 112
},
{
"completion_length": 264.4765625,
"epoch": 0.1314717859220477,
"grad_norm": 5.665522812893531,
"kl": 0.190673828125,
"learning_rate": 9.7332598048479e-07,
"loss": 0.0076,
"reward": 0.966796875,
"reward_std": 0.4744589924812317,
"rewards/correctness_reward_func": 0.5390625,
"rewards/strict_format_reward_func": 0.427734375,
"step": 113
},
{
"completion_length": 313.296875,
"epoch": 0.13263525305410123,
"grad_norm": 0.6167995629967288,
"kl": 0.0430908203125,
"learning_rate": 9.727149310846523e-07,
"loss": 0.0017,
"reward": 0.662109375,
"reward_std": 0.4723682776093483,
"rewards/correctness_reward_func": 0.25,
"rewards/strict_format_reward_func": 0.412109375,
"step": 114
},
{
"completion_length": 238.8828125,
"epoch": 0.13379872018615474,
"grad_norm": 5932.568937857461,
"kl": 31.500244140625,
"learning_rate": 9.720971579811065e-07,
"loss": 1.2647,
"reward": 0.939453125,
"reward_std": 0.4201255813241005,
"rewards/correctness_reward_func": 0.4921875,
"rewards/strict_format_reward_func": 0.447265625,
"step": 115
},
{
"completion_length": 270.9375,
"epoch": 0.13496218731820825,
"grad_norm": 2.0818128486642893,
"kl": 0.11199951171875,
"learning_rate": 9.714726699611037e-07,
"loss": 0.0045,
"reward": 0.6328125,
"reward_std": 0.34041667729616165,
"rewards/correctness_reward_func": 0.234375,
"rewards/strict_format_reward_func": 0.3984375,
"step": 116
},
{
"completion_length": 244.61328125,
"epoch": 0.13612565445026178,
"grad_norm": 29.72218794430255,
"kl": 0.26605224609375,
"learning_rate": 9.708414759071057e-07,
"loss": 0.0106,
"reward": 0.853515625,
"reward_std": 0.4693729430437088,
"rewards/correctness_reward_func": 0.4140625,
"rewards/strict_format_reward_func": 0.439453125,
"step": 117
},
{
"completion_length": 269.69140625,
"epoch": 0.1372891215823153,
"grad_norm": 9.459551717392767,
"kl": 0.16796875,
"learning_rate": 9.702035847969578e-07,
"loss": 0.0067,
"reward": 0.802734375,
"reward_std": 0.45168111473321915,
"rewards/correctness_reward_func": 0.3671875,
"rewards/strict_format_reward_func": 0.435546875,
"step": 118
},
{
"completion_length": 263.65625,
"epoch": 0.13845258871436883,
"grad_norm": 3.6379354568837274,
"kl": 0.19122314453125,
"learning_rate": 9.695590057037618e-07,
"loss": 0.0077,
"reward": 0.71875,
"reward_std": 0.4168992340564728,
"rewards/correctness_reward_func": 0.2890625,
"rewards/strict_format_reward_func": 0.4296875,
"step": 119
},
{
"completion_length": 255.9921875,
"epoch": 0.13961605584642234,
"grad_norm": 6.956294376892292,
"kl": 0.4310302734375,
"learning_rate": 9.689077477957468e-07,
"loss": 0.0172,
"reward": 0.85546875,
"reward_std": 0.5010552629828453,
"rewards/correctness_reward_func": 0.4375,
"rewards/strict_format_reward_func": 0.41796875,
"step": 120
},
{
"completion_length": 302.72265625,
"epoch": 0.14077952297847585,
"grad_norm": 3.7055581750620203,
"kl": 0.0943603515625,
"learning_rate": 9.682498203361378e-07,
"loss": 0.0038,
"reward": 0.6796875,
"reward_std": 0.431684710085392,
"rewards/correctness_reward_func": 0.2734375,
"rewards/strict_format_reward_func": 0.40625,
"step": 121
},
{
"completion_length": 287.06640625,
"epoch": 0.14194299011052938,
"grad_norm": 3.6994429011170293,
"kl": 0.097412109375,
"learning_rate": 9.675852326830254e-07,
"loss": 0.0039,
"reward": 0.599609375,
"reward_std": 0.2685260437428951,
"rewards/correctness_reward_func": 0.1796875,
"rewards/strict_format_reward_func": 0.419921875,
"step": 122
},
{
"completion_length": 244.97265625,
"epoch": 0.1431064572425829,
"grad_norm": 5.330519684929318,
"kl": 0.474609375,
"learning_rate": 9.669139942892323e-07,
"loss": 0.019,
"reward": 0.904296875,
"reward_std": 0.45676978677511215,
"rewards/correctness_reward_func": 0.4609375,
"rewards/strict_format_reward_func": 0.443359375,
"step": 123
},
{
"completion_length": 244.953125,
"epoch": 0.14426992437463643,
"grad_norm": 7.439747574901674,
"kl": 0.082763671875,
"learning_rate": 9.66236114702178e-07,
"loss": 0.0033,
"reward": 0.830078125,
"reward_std": 0.4560399353504181,
"rewards/correctness_reward_func": 0.3828125,
"rewards/strict_format_reward_func": 0.447265625,
"step": 124
},
{
"completion_length": 247.43359375,
"epoch": 0.14543339150668994,
"grad_norm": 8.444741434329583,
"kl": 0.191650390625,
"learning_rate": 9.655516035637436e-07,
"loss": 0.0077,
"reward": 0.7890625,
"reward_std": 0.39387788623571396,
"rewards/correctness_reward_func": 0.359375,
"rewards/strict_format_reward_func": 0.4296875,
"step": 125
},
{
"completion_length": 286.15625,
"epoch": 0.14659685863874344,
"grad_norm": 1.7413985122485363,
"kl": 0.1680908203125,
"learning_rate": 9.648604706101354e-07,
"loss": 0.0067,
"reward": 0.775390625,
"reward_std": 0.4273899048566818,
"rewards/correctness_reward_func": 0.3515625,
"rewards/strict_format_reward_func": 0.423828125,
"step": 126
},
{
"completion_length": 266.35546875,
"epoch": 0.14776032577079698,
"grad_norm": 23.48576289003154,
"kl": 0.429443359375,
"learning_rate": 9.641627256717452e-07,
"loss": 0.0171,
"reward": 0.904296875,
"reward_std": 0.4860465005040169,
"rewards/correctness_reward_func": 0.4609375,
"rewards/strict_format_reward_func": 0.443359375,
"step": 127
},
{
"completion_length": 280.5,
"epoch": 0.1489237929028505,
"grad_norm": 3.434249282079891,
"kl": 0.12042236328125,
"learning_rate": 9.634583786730108e-07,
"loss": 0.0048,
"reward": 0.912109375,
"reward_std": 0.4505278691649437,
"rewards/correctness_reward_func": 0.4765625,
"rewards/strict_format_reward_func": 0.435546875,
"step": 128
},
{
"completion_length": 252.86328125,
"epoch": 0.15008726003490402,
"grad_norm": 0.509688518372827,
"kl": 0.05780029296875,
"learning_rate": 9.627474396322753e-07,
"loss": 0.0023,
"reward": 0.833984375,
"reward_std": 0.3295654430985451,
"rewards/correctness_reward_func": 0.390625,
"rewards/strict_format_reward_func": 0.443359375,
"step": 129
},
{
"completion_length": 244.61328125,
"epoch": 0.15125072716695753,
"grad_norm": 322.8549051555672,
"kl": 0.65301513671875,
"learning_rate": 9.62029918661644e-07,
"loss": 0.0261,
"reward": 0.849609375,
"reward_std": 0.40418654680252075,
"rewards/correctness_reward_func": 0.3984375,
"rewards/strict_format_reward_func": 0.451171875,
"step": 130
},
{
"completion_length": 274.83203125,
"epoch": 0.15241419429901104,
"grad_norm": 2.5471313815643635,
"kl": 0.08868408203125,
"learning_rate": 9.613058259668414e-07,
"loss": 0.0035,
"reward": 0.732421875,
"reward_std": 0.3115438222885132,
"rewards/correctness_reward_func": 0.28125,
"rewards/strict_format_reward_func": 0.451171875,
"step": 131
},
{
"completion_length": 263.140625,
"epoch": 0.15357766143106458,
"grad_norm": 0.4917581912639935,
"kl": 0.05181884765625,
"learning_rate": 9.60575171847065e-07,
"loss": 0.0021,
"reward": 0.8828125,
"reward_std": 0.3844335228204727,
"rewards/correctness_reward_func": 0.421875,
"rewards/strict_format_reward_func": 0.4609375,
"step": 132
},
{
"completion_length": 240.77734375,
"epoch": 0.15474112856311809,
"grad_norm": 1.7256071643625732,
"kl": 0.05010986328125,
"learning_rate": 9.598379666948393e-07,
"loss": 0.002,
"reward": 0.923828125,
"reward_std": 0.44481250643730164,
"rewards/correctness_reward_func": 0.46875,
"rewards/strict_format_reward_func": 0.455078125,
"step": 133
},
{
"completion_length": 262.2890625,
"epoch": 0.15590459569517162,
"grad_norm": 1.803011519361335,
"kl": 0.06396484375,
"learning_rate": 9.590942209958686e-07,
"loss": 0.0026,
"reward": 0.951171875,
"reward_std": 0.5317578241229057,
"rewards/correctness_reward_func": 0.5078125,
"rewards/strict_format_reward_func": 0.443359375,
"step": 134
},
{
"completion_length": 264.09375,
"epoch": 0.15706806282722513,
"grad_norm": 2.320201832882074,
"kl": 0.052978515625,
"learning_rate": 9.583439453288864e-07,
"loss": 0.0021,
"reward": 0.7890625,
"reward_std": 0.4296160414814949,
"rewards/correctness_reward_func": 0.3515625,
"rewards/strict_format_reward_func": 0.4375,
"step": 135
},
{
"completion_length": 265.18359375,
"epoch": 0.15823152995927864,
"grad_norm": 0.8648510710425397,
"kl": 0.0433349609375,
"learning_rate": 9.575871503655067e-07,
"loss": 0.0017,
"reward": 1.017578125,
"reward_std": 0.5687045827507973,
"rewards/correctness_reward_func": 0.5546875,
"rewards/strict_format_reward_func": 0.462890625,
"step": 136
},
{
"completion_length": 313.328125,
"epoch": 0.15939499709133217,
"grad_norm": 7.228345785913302,
"kl": 0.09490966796875,
"learning_rate": 9.568238468700705e-07,
"loss": 0.0038,
"reward": 0.625,
"reward_std": 0.32062922045588493,
"rewards/correctness_reward_func": 0.1875,
"rewards/strict_format_reward_func": 0.4375,
"step": 137
},
{
"completion_length": 272.78125,
"epoch": 0.16055846422338568,
"grad_norm": 4.791778485714245,
"kl": 0.06451416015625,
"learning_rate": 9.560540456994939e-07,
"loss": 0.0026,
"reward": 0.73828125,
"reward_std": 0.3905741199851036,
"rewards/correctness_reward_func": 0.2734375,
"rewards/strict_format_reward_func": 0.46484375,
"step": 138
},
{
"completion_length": 284.91015625,
"epoch": 0.16172193135543922,
"grad_norm": 1.0070605023468475,
"kl": 0.0789794921875,
"learning_rate": 9.552777578031133e-07,
"loss": 0.0032,
"reward": 0.86328125,
"reward_std": 0.45894617587327957,
"rewards/correctness_reward_func": 0.4140625,
"rewards/strict_format_reward_func": 0.44921875,
"step": 139
},
{
"completion_length": 254.89453125,
"epoch": 0.16288539848749273,
"grad_norm": 0.26164597278669394,
"kl": 0.0531005859375,
"learning_rate": 9.544949942225295e-07,
"loss": 0.0021,
"reward": 0.888671875,
"reward_std": 0.4409971535205841,
"rewards/correctness_reward_func": 0.4453125,
"rewards/strict_format_reward_func": 0.443359375,
"step": 140
},
{
"completion_length": 297.02734375,
"epoch": 0.16404886561954624,
"grad_norm": 49.61137761643266,
"kl": 0.19952392578125,
"learning_rate": 9.537057660914508e-07,
"loss": 0.008,
"reward": 0.78515625,
"reward_std": 0.2939911261200905,
"rewards/correctness_reward_func": 0.3359375,
"rewards/strict_format_reward_func": 0.44921875,
"step": 141
},
{
"completion_length": 228.41015625,
"epoch": 0.16521233275159977,
"grad_norm": 2.4563226616711726,
"kl": 0.13043212890625,
"learning_rate": 9.529100846355345e-07,
"loss": 0.0052,
"reward": 0.982421875,
"reward_std": 0.36371277645230293,
"rewards/correctness_reward_func": 0.515625,
"rewards/strict_format_reward_func": 0.466796875,
"step": 142
},
{
"completion_length": 253.48828125,
"epoch": 0.16637579988365328,
"grad_norm": 1.7492293672269494,
"kl": 0.06890869140625,
"learning_rate": 9.521079611722276e-07,
"loss": 0.0028,
"reward": 0.892578125,
"reward_std": 0.3547302335500717,
"rewards/correctness_reward_func": 0.4296875,
"rewards/strict_format_reward_func": 0.462890625,
"step": 143
},
{
"completion_length": 270.91015625,
"epoch": 0.16753926701570682,
"grad_norm": 186.25321871346733,
"kl": 2.019775390625,
"learning_rate": 9.512994071106054e-07,
"loss": 0.0808,
"reward": 0.876953125,
"reward_std": 0.35532546043395996,
"rewards/correctness_reward_func": 0.421875,
"rewards/strict_format_reward_func": 0.455078125,
"step": 144
},
{
"completion_length": 251.24609375,
"epoch": 0.16870273414776032,
"grad_norm": 20.433967338967815,
"kl": 0.61883544921875,
"learning_rate": 9.504844339512094e-07,
"loss": 0.0247,
"reward": 1.021484375,
"reward_std": 0.4558466151356697,
"rewards/correctness_reward_func": 0.5546875,
"rewards/strict_format_reward_func": 0.466796875,
"step": 145
},
{
"completion_length": 258.37109375,
"epoch": 0.16986620127981383,
"grad_norm": 1.2588068639439651,
"kl": 0.07843017578125,
"learning_rate": 9.49663053285884e-07,
"loss": 0.0031,
"reward": 0.8203125,
"reward_std": 0.40197764337062836,
"rewards/correctness_reward_func": 0.3515625,
"rewards/strict_format_reward_func": 0.46875,
"step": 146
},
{
"completion_length": 249.65625,
"epoch": 0.17102966841186737,
"grad_norm": 26.616164942367558,
"kl": 0.16827392578125,
"learning_rate": 9.488352767976109e-07,
"loss": 0.0067,
"reward": 0.951171875,
"reward_std": 0.43833911418914795,
"rewards/correctness_reward_func": 0.484375,
"rewards/strict_format_reward_func": 0.466796875,
"step": 147
},
{
"completion_length": 266.21484375,
"epoch": 0.17219313554392088,
"grad_norm": 16.56862163073421,
"kl": 0.13897705078125,
"learning_rate": 9.480011162603434e-07,
"loss": 0.0056,
"reward": 0.994140625,
"reward_std": 0.41204124689102173,
"rewards/correctness_reward_func": 0.5390625,
"rewards/strict_format_reward_func": 0.455078125,
"step": 148
},
{
"completion_length": 305.5078125,
"epoch": 0.1733566026759744,
"grad_norm": 0.6862061857964759,
"kl": 0.0863037109375,
"learning_rate": 9.471605835388392e-07,
"loss": 0.0035,
"reward": 0.7421875,
"reward_std": 0.3801525831222534,
"rewards/correctness_reward_func": 0.28125,
"rewards/strict_format_reward_func": 0.4609375,
"step": 149
},
{
"completion_length": 257.46484375,
"epoch": 0.17452006980802792,
"grad_norm": 7.269994609894895,
"kl": 0.1292724609375,
"learning_rate": 9.463136905884912e-07,
"loss": 0.0052,
"reward": 0.96484375,
"reward_std": 0.4492557644844055,
"rewards/correctness_reward_func": 0.515625,
"rewards/strict_format_reward_func": 0.44921875,
"step": 150
},
{
"completion_length": 269.76171875,
"epoch": 0.17568353694008143,
"grad_norm": 3.546788791129262,
"kl": 0.2747802734375,
"learning_rate": 9.454604494551577e-07,
"loss": 0.011,
"reward": 0.794921875,
"reward_std": 0.49609148502349854,
"rewards/correctness_reward_func": 0.34375,
"rewards/strict_format_reward_func": 0.451171875,
"step": 151
},
{
"completion_length": 241.5078125,
"epoch": 0.17684700407213497,
"grad_norm": 19.898589503994543,
"kl": 0.2518310546875,
"learning_rate": 9.446008722749905e-07,
"loss": 0.0101,
"reward": 0.912109375,
"reward_std": 0.3499249704182148,
"rewards/correctness_reward_func": 0.4453125,
"rewards/strict_format_reward_func": 0.466796875,
"step": 152
},
{
"completion_length": 312.5625,
"epoch": 0.17801047120418848,
"grad_norm": 7.082944795431713,
"kl": 0.22540283203125,
"learning_rate": 9.437349712742634e-07,
"loss": 0.009,
"reward": 0.93359375,
"reward_std": 0.38279156386852264,
"rewards/correctness_reward_func": 0.4609375,
"rewards/strict_format_reward_func": 0.47265625,
"step": 153
},
{
"completion_length": 290.33203125,
"epoch": 0.179173938336242,
"grad_norm": 2.204011656968964,
"kl": 0.066650390625,
"learning_rate": 9.428627587691971e-07,
"loss": 0.0027,
"reward": 0.783203125,
"reward_std": 0.37652380019426346,
"rewards/correctness_reward_func": 0.3359375,
"rewards/strict_format_reward_func": 0.447265625,
"step": 154
},
{
"completion_length": 291.9921875,
"epoch": 0.18033740546829552,
"grad_norm": 28.05008573486938,
"kl": 0.36279296875,
"learning_rate": 9.419842471657846e-07,
"loss": 0.0145,
"reward": 0.794921875,
"reward_std": 0.45617077499628067,
"rewards/correctness_reward_func": 0.3671875,
"rewards/strict_format_reward_func": 0.427734375,
"step": 155
},
{
"completion_length": 286.06640625,
"epoch": 0.18150087260034903,
"grad_norm": 43.77812238882512,
"kl": 2.6201171875,
"learning_rate": 9.410994489596153e-07,
"loss": 0.105,
"reward": 0.904296875,
"reward_std": 0.4255755990743637,
"rewards/correctness_reward_func": 0.453125,
"rewards/strict_format_reward_func": 0.451171875,
"step": 156
},
{
"completion_length": 241.26953125,
"epoch": 0.18266433973240256,
"grad_norm": 10.84079560138156,
"kl": 1.619384765625,
"learning_rate": 9.402083767356957e-07,
"loss": 0.0646,
"reward": 0.904296875,
"reward_std": 0.3274926654994488,
"rewards/correctness_reward_func": 0.4453125,
"rewards/strict_format_reward_func": 0.458984375,
"step": 157
},
{
"completion_length": 248.51171875,
"epoch": 0.18382780686445607,
"grad_norm": 65.24252743780556,
"kl": 2.162109375,
"learning_rate": 9.393110431682721e-07,
"loss": 0.0867,
"reward": 0.85546875,
"reward_std": 0.3152204602956772,
"rewards/correctness_reward_func": 0.390625,
"rewards/strict_format_reward_func": 0.46484375,
"step": 158
},
{
"completion_length": 283.03125,
"epoch": 0.1849912739965096,
"grad_norm": 47.62423123432252,
"kl": 0.57366943359375,
"learning_rate": 9.384074610206493e-07,
"loss": 0.023,
"reward": 0.95703125,
"reward_std": 0.458492249250412,
"rewards/correctness_reward_func": 0.5,
"rewards/strict_format_reward_func": 0.45703125,
"step": 159
},
{
"completion_length": 286.9921875,
"epoch": 0.18615474112856312,
"grad_norm": 18.91448421977711,
"kl": 1.2669677734375,
"learning_rate": 9.374976431450094e-07,
"loss": 0.0508,
"reward": 0.943359375,
"reward_std": 0.33836888894438744,
"rewards/correctness_reward_func": 0.484375,
"rewards/strict_format_reward_func": 0.458984375,
"step": 160
},
{
"completion_length": 279.0,
"epoch": 0.18731820826061663,
"grad_norm": 706.4286890306558,
"kl": 2.40673828125,
"learning_rate": 9.365816024822288e-07,
"loss": 0.0961,
"reward": 0.9609375,
"reward_std": 0.39441975951194763,
"rewards/correctness_reward_func": 0.5078125,
"rewards/strict_format_reward_func": 0.453125,
"step": 161
},
{
"completion_length": 292.0234375,
"epoch": 0.18848167539267016,
"grad_norm": 17.463248908977885,
"kl": 0.15557861328125,
"learning_rate": 9.356593520616946e-07,
"loss": 0.0062,
"reward": 0.943359375,
"reward_std": 0.3664560765028,
"rewards/correctness_reward_func": 0.4921875,
"rewards/strict_format_reward_func": 0.451171875,
"step": 162
},
{
"completion_length": 331.6328125,
"epoch": 0.18964514252472367,
"grad_norm": 2.249132481438475,
"kl": 0.0911865234375,
"learning_rate": 9.347309050011186e-07,
"loss": 0.0036,
"reward": 0.765625,
"reward_std": 0.37423864006996155,
"rewards/correctness_reward_func": 0.3125,
"rewards/strict_format_reward_func": 0.453125,
"step": 163
},
{
"completion_length": 297.85546875,
"epoch": 0.1908086096567772,
"grad_norm": 0.5787880880323766,
"kl": 0.0926513671875,
"learning_rate": 9.337962745063512e-07,
"loss": 0.0037,
"reward": 0.810546875,
"reward_std": 0.49850574135780334,
"rewards/correctness_reward_func": 0.359375,
"rewards/strict_format_reward_func": 0.451171875,
"step": 164
},
{
"completion_length": 265.62109375,
"epoch": 0.19197207678883071,
"grad_norm": 2.6837086070654985,
"kl": 0.05865478515625,
"learning_rate": 9.328554738711935e-07,
"loss": 0.0023,
"reward": 0.921875,
"reward_std": 0.513519324362278,
"rewards/correctness_reward_func": 0.4609375,
"rewards/strict_format_reward_func": 0.4609375,
"step": 165
},
{
"completion_length": 246.55859375,
"epoch": 0.19313554392088422,
"grad_norm": 3.4134385495340216,
"kl": 0.2457275390625,
"learning_rate": 9.31908516477208e-07,
"loss": 0.0098,
"reward": 0.89453125,
"reward_std": 0.4189570024609566,
"rewards/correctness_reward_func": 0.4375,
"rewards/strict_format_reward_func": 0.45703125,
"step": 166
},
{
"completion_length": 246.66015625,
"epoch": 0.19429901105293776,
"grad_norm": 0.9230642941742443,
"kl": 0.11737060546875,
"learning_rate": 9.309554157935286e-07,
"loss": 0.0047,
"reward": 0.96875,
"reward_std": 0.4920966625213623,
"rewards/correctness_reward_func": 0.5078125,
"rewards/strict_format_reward_func": 0.4609375,
"step": 167
},
{
"completion_length": 225.83984375,
"epoch": 0.19546247818499127,
"grad_norm": 2.8669767245130577,
"kl": 0.05242919921875,
"learning_rate": 9.299961853766689e-07,
"loss": 0.0021,
"reward": 0.978515625,
"reward_std": 0.443370521068573,
"rewards/correctness_reward_func": 0.5234375,
"rewards/strict_format_reward_func": 0.455078125,
"step": 168
},
{
"completion_length": 273.70703125,
"epoch": 0.1966259453170448,
"grad_norm": 0.9798307739601261,
"kl": 0.08355712890625,
"learning_rate": 9.290308388703288e-07,
"loss": 0.0033,
"reward": 0.791015625,
"reward_std": 0.32959311455488205,
"rewards/correctness_reward_func": 0.328125,
"rewards/strict_format_reward_func": 0.462890625,
"step": 169
},
{
"completion_length": 306.48828125,
"epoch": 0.1977894124490983,
"grad_norm": 16.018378185855333,
"kl": 0.63409423828125,
"learning_rate": 9.280593900052014e-07,
"loss": 0.0254,
"reward": 0.822265625,
"reward_std": 0.4224512651562691,
"rewards/correctness_reward_func": 0.375,
"rewards/strict_format_reward_func": 0.447265625,
"step": 170
},
{
"completion_length": 307.9375,
"epoch": 0.19895287958115182,
"grad_norm": 8.264210864082758,
"kl": 0.29107666015625,
"learning_rate": 9.270818525987771e-07,
"loss": 0.0117,
"reward": 0.87890625,
"reward_std": 0.4745420068502426,
"rewards/correctness_reward_func": 0.4453125,
"rewards/strict_format_reward_func": 0.43359375,
"step": 171
},
{
"completion_length": 273.25,
"epoch": 0.20011634671320536,
"grad_norm": 3200.4657340851295,
"kl": 2.255615234375,
"learning_rate": 9.260982405551476e-07,
"loss": 0.0899,
"reward": 0.892578125,
"reward_std": 0.49684761464595795,
"rewards/correctness_reward_func": 0.4375,
"rewards/strict_format_reward_func": 0.455078125,
"step": 172
},
{
"completion_length": 301.10546875,
"epoch": 0.20127981384525886,
"grad_norm": 0.4106069684572662,
"kl": 0.06402587890625,
"learning_rate": 9.251085678648071e-07,
"loss": 0.0026,
"reward": 0.775390625,
"reward_std": 0.3673408329486847,
"rewards/correctness_reward_func": 0.3359375,
"rewards/strict_format_reward_func": 0.439453125,
"step": 173
},
{
"completion_length": 306.25,
"epoch": 0.2024432809773124,
"grad_norm": 2.4526371721107645,
"kl": 0.0994873046875,
"learning_rate": 9.241128486044542e-07,
"loss": 0.004,
"reward": 0.94140625,
"reward_std": 0.5130884796380997,
"rewards/correctness_reward_func": 0.4921875,
"rewards/strict_format_reward_func": 0.44921875,
"step": 174
},
{
"completion_length": 294.6015625,
"epoch": 0.2036067481093659,
"grad_norm": 7.724202386944696,
"kl": 0.11065673828125,
"learning_rate": 9.231110969367918e-07,
"loss": 0.0044,
"reward": 0.8359375,
"reward_std": 0.4581919386982918,
"rewards/correctness_reward_func": 0.3984375,
"rewards/strict_format_reward_func": 0.4375,
"step": 175
},
{
"completion_length": 254.26953125,
"epoch": 0.20477021524141942,
"grad_norm": 4.0122059528505085,
"kl": 0.31842041015625,
"learning_rate": 9.221033271103249e-07,
"loss": 0.0127,
"reward": 0.92578125,
"reward_std": 0.4053529165685177,
"rewards/correctness_reward_func": 0.4609375,
"rewards/strict_format_reward_func": 0.46484375,
"step": 176
},
{
"completion_length": 309.66796875,
"epoch": 0.20593368237347295,
"grad_norm": 2.06130360097307,
"kl": 0.1171875,
"learning_rate": 9.210895534591582e-07,
"loss": 0.0047,
"reward": 0.8984375,
"reward_std": 0.4733218848705292,
"rewards/correctness_reward_func": 0.4453125,
"rewards/strict_format_reward_func": 0.453125,
"step": 177
},
{
"completion_length": 323.63671875,
"epoch": 0.20709714950552646,
"grad_norm": 5.004188106979899,
"kl": 0.0877685546875,
"learning_rate": 9.200697904027927e-07,
"loss": 0.0035,
"reward": 0.783203125,
"reward_std": 0.4214430972933769,
"rewards/correctness_reward_func": 0.34375,
"rewards/strict_format_reward_func": 0.439453125,
"step": 178
},
{
"completion_length": 305.921875,
"epoch": 0.20826061663758,
"grad_norm": 1.6668849296659807,
"kl": 0.0859375,
"learning_rate": 9.190440524459202e-07,
"loss": 0.0034,
"reward": 0.916015625,
"reward_std": 0.48984475433826447,
"rewards/correctness_reward_func": 0.46875,
"rewards/strict_format_reward_func": 0.447265625,
"step": 179
},
{
"completion_length": 246.859375,
"epoch": 0.2094240837696335,
"grad_norm": 3.1773447652134212,
"kl": 0.0968017578125,
"learning_rate": 9.18012354178217e-07,
"loss": 0.0039,
"reward": 1.044921875,
"reward_std": 0.49948541074991226,
"rewards/correctness_reward_func": 0.5859375,
"rewards/strict_format_reward_func": 0.458984375,
"step": 180
},
{
"completion_length": 270.046875,
"epoch": 0.21058755090168702,
"grad_norm": 12.240600235073,
"kl": 0.146240234375,
"learning_rate": 9.16974710274136e-07,
"loss": 0.0059,
"reward": 0.888671875,
"reward_std": 0.4256081059575081,
"rewards/correctness_reward_func": 0.4375,
"rewards/strict_format_reward_func": 0.451171875,
"step": 181
},
{
"completion_length": 277.4921875,
"epoch": 0.21175101803374055,
"grad_norm": 41.948078009277715,
"kl": 0.93603515625,
"learning_rate": 9.159311354926989e-07,
"loss": 0.0376,
"reward": 0.994140625,
"reward_std": 0.42895379662513733,
"rewards/correctness_reward_func": 0.53125,
"rewards/strict_format_reward_func": 0.462890625,
"step": 182
},
{
"completion_length": 270.48046875,
"epoch": 0.21291448516579406,
"grad_norm": 1.2973502608915635,
"kl": 0.06439208984375,
"learning_rate": 9.148816446772858e-07,
"loss": 0.0026,
"reward": 0.755859375,
"reward_std": 0.29502545297145844,
"rewards/correctness_reward_func": 0.3046875,
"rewards/strict_format_reward_func": 0.451171875,
"step": 183
},
{
"completion_length": 297.19921875,
"epoch": 0.2140779522978476,
"grad_norm": 2.500567548574192,
"kl": 0.09307861328125,
"learning_rate": 9.138262527554237e-07,
"loss": 0.0037,
"reward": 0.91796875,
"reward_std": 0.4057604745030403,
"rewards/correctness_reward_func": 0.4609375,
"rewards/strict_format_reward_func": 0.45703125,
"step": 184
},
{
"completion_length": 226.44140625,
"epoch": 0.2152414194299011,
"grad_norm": 14.743103963897818,
"kl": 0.131103515625,
"learning_rate": 9.127649747385748e-07,
"loss": 0.0052,
"reward": 1.005859375,
"reward_std": 0.48278648406267166,
"rewards/correctness_reward_func": 0.5390625,
"rewards/strict_format_reward_func": 0.466796875,
"step": 185
},
{
"completion_length": 295.98046875,
"epoch": 0.2164048865619546,
"grad_norm": 2.599485686164368,
"kl": 0.10174560546875,
"learning_rate": 9.116978257219223e-07,
"loss": 0.0041,
"reward": 0.900390625,
"reward_std": 0.4165002331137657,
"rewards/correctness_reward_func": 0.4453125,
"rewards/strict_format_reward_func": 0.455078125,
"step": 186
},
{
"completion_length": 273.453125,
"epoch": 0.21756835369400815,
"grad_norm": 1.9090617402205379,
"kl": 0.08551025390625,
"learning_rate": 9.106248208841568e-07,
"loss": 0.0034,
"reward": 0.875,
"reward_std": 0.3356376253068447,
"rewards/correctness_reward_func": 0.40625,
"rewards/strict_format_reward_func": 0.46875,
"step": 187
},
{
"completion_length": 262.65625,
"epoch": 0.21873182082606166,
"grad_norm": 144.5858579696907,
"kl": 1.45574951171875,
"learning_rate": 9.095459754872588e-07,
"loss": 0.0584,
"reward": 0.818359375,
"reward_std": 0.437059473246336,
"rewards/correctness_reward_func": 0.3515625,
"rewards/strict_format_reward_func": 0.466796875,
"step": 188
},
{
"completion_length": 292.56640625,
"epoch": 0.2198952879581152,
"grad_norm": 1.0988973248113927,
"kl": 0.07281494140625,
"learning_rate": 9.084613048762833e-07,
"loss": 0.0029,
"reward": 0.90625,
"reward_std": 0.5009823776781559,
"rewards/correctness_reward_func": 0.4609375,
"rewards/strict_format_reward_func": 0.4453125,
"step": 189
},
{
"completion_length": 297.6640625,
"epoch": 0.2210587550901687,
"grad_norm": 0.7765766278689193,
"kl": 0.0587158203125,
"learning_rate": 9.073708244791405e-07,
"loss": 0.0023,
"reward": 0.9609375,
"reward_std": 0.39268171787261963,
"rewards/correctness_reward_func": 0.4921875,
"rewards/strict_format_reward_func": 0.46875,
"step": 190
},
{
"completion_length": 259.953125,
"epoch": 0.2222222222222222,
"grad_norm": 0.7753362260289501,
"kl": 0.05169677734375,
"learning_rate": 9.062745498063764e-07,
"loss": 0.0021,
"reward": 0.775390625,
"reward_std": 0.2924235537648201,
"rewards/correctness_reward_func": 0.3125,
"rewards/strict_format_reward_func": 0.462890625,
"step": 191
},
{
"completion_length": 347.28515625,
"epoch": 0.22338568935427575,
"grad_norm": 1.6977885523114584,
"kl": 0.0623779296875,
"learning_rate": 9.051724964509526e-07,
"loss": 0.0025,
"reward": 0.88671875,
"reward_std": 0.3994247317314148,
"rewards/correctness_reward_func": 0.4296875,
"rewards/strict_format_reward_func": 0.45703125,
"step": 192
},
{
"completion_length": 265.41015625,
"epoch": 0.22454915648632925,
"grad_norm": 18.19050532587102,
"kl": 0.7923583984375,
"learning_rate": 9.040646800880242e-07,
"loss": 0.0317,
"reward": 0.892578125,
"reward_std": 0.5228888541460037,
"rewards/correctness_reward_func": 0.453125,
"rewards/strict_format_reward_func": 0.439453125,
"step": 193
},
{
"completion_length": 312.88671875,
"epoch": 0.2257126236183828,
"grad_norm": 0.8340243461295392,
"kl": 0.0703125,
"learning_rate": 9.029511164747175e-07,
"loss": 0.0028,
"reward": 0.83984375,
"reward_std": 0.3894122242927551,
"rewards/correctness_reward_func": 0.40625,
"rewards/strict_format_reward_func": 0.43359375,
"step": 194
},
{
"completion_length": 286.765625,
"epoch": 0.2268760907504363,
"grad_norm": 1.200161776433764,
"kl": 0.05950927734375,
"learning_rate": 9.018318214499041e-07,
"loss": 0.0024,
"reward": 0.8203125,
"reward_std": 0.3945379853248596,
"rewards/correctness_reward_func": 0.3671875,
"rewards/strict_format_reward_func": 0.453125,
"step": 195
},
{
"completion_length": 247.5234375,
"epoch": 0.2280395578824898,
"grad_norm": 0.3212206681274319,
"kl": 0.06475830078125,
"learning_rate": 9.007068109339783e-07,
"loss": 0.0026,
"reward": 0.94921875,
"reward_std": 0.41352738067507744,
"rewards/correctness_reward_func": 0.5,
"rewards/strict_format_reward_func": 0.44921875,
"step": 196
},
{
"completion_length": 242.36328125,
"epoch": 0.22920302501454334,
"grad_norm": 98.99732039672644,
"kl": 0.5511474609375,
"learning_rate": 8.995761009286282e-07,
"loss": 0.0221,
"reward": 0.99609375,
"reward_std": 0.35693345218896866,
"rewards/correctness_reward_func": 0.5546875,
"rewards/strict_format_reward_func": 0.44140625,
"step": 197
},
{
"completion_length": 251.578125,
"epoch": 0.23036649214659685,
"grad_norm": 1.7688888702316456,
"kl": 0.063720703125,
"learning_rate": 8.984397075166095e-07,
"loss": 0.0026,
"reward": 0.904296875,
"reward_std": 0.500565767288208,
"rewards/correctness_reward_func": 0.4453125,
"rewards/strict_format_reward_func": 0.458984375,
"step": 198
},
{
"completion_length": 289.9921875,
"epoch": 0.2315299592786504,
"grad_norm": 1.0097855862170513,
"kl": 0.0572509765625,
"learning_rate": 8.97297646861516e-07,
"loss": 0.0023,
"reward": 0.830078125,
"reward_std": 0.39669080078601837,
"rewards/correctness_reward_func": 0.3828125,
"rewards/strict_format_reward_func": 0.447265625,
"step": 199
},
{
"completion_length": 254.05859375,
"epoch": 0.2326934264107039,
"grad_norm": 1.2860517043327864,
"kl": 0.0604248046875,
"learning_rate": 8.96149935207551e-07,
"loss": 0.0024,
"reward": 0.96875,
"reward_std": 0.3961385078728199,
"rewards/correctness_reward_func": 0.5078125,
"rewards/strict_format_reward_func": 0.4609375,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 859,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}