Zery's picture
Upload Qwen2VL fine-tuned model
1509245 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.0,
"eval_steps": 500,
"global_step": 498,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 82.65625,
"epoch": 0.012048192771084338,
"grad_norm": 5.841508927710052,
"kl": 0.0,
"learning_rate": 9.97991967871486e-07,
"loss": 0.0,
"reward": 1.4489864706993103,
"reward_std": 0.8421240150928497,
"rewards/accuracy_reward": 0.8005490005016327,
"rewards/format_reward": 0.6484375,
"step": 1
},
{
"completion_length": 91.453125,
"epoch": 0.024096385542168676,
"grad_norm": 4.392637703815363,
"kl": 0.00279998779296875,
"learning_rate": 9.959839357429717e-07,
"loss": 0.0001,
"reward": 1.3076424598693848,
"reward_std": 0.8380775451660156,
"rewards/accuracy_reward": 0.6123300492763519,
"rewards/format_reward": 0.6953125,
"step": 2
},
{
"completion_length": 79.171875,
"epoch": 0.03614457831325301,
"grad_norm": 5.134937236220538,
"kl": 0.009063720703125,
"learning_rate": 9.93975903614458e-07,
"loss": 0.0004,
"reward": 1.650797963142395,
"reward_std": 0.8256142735481262,
"rewards/accuracy_reward": 0.8773605227470398,
"rewards/format_reward": 0.7734375,
"step": 3
},
{
"completion_length": 90.8671875,
"epoch": 0.04819277108433735,
"grad_norm": 4.181043208735878,
"kl": 0.0099029541015625,
"learning_rate": 9.919678714859437e-07,
"loss": 0.0004,
"reward": 1.4978268146514893,
"reward_std": 0.7668428122997284,
"rewards/accuracy_reward": 0.6618892848491669,
"rewards/format_reward": 0.8359375,
"step": 4
},
{
"completion_length": 83.15625,
"epoch": 0.060240963855421686,
"grad_norm": 4.623169300333461,
"kl": 0.028106689453125,
"learning_rate": 9.899598393574296e-07,
"loss": 0.0011,
"reward": 1.959537386894226,
"reward_std": 0.6147363781929016,
"rewards/accuracy_reward": 1.0532873272895813,
"rewards/format_reward": 0.90625,
"step": 5
},
{
"completion_length": 75.1484375,
"epoch": 0.07228915662650602,
"grad_norm": 5.568012410409197,
"kl": 0.03021240234375,
"learning_rate": 9.879518072289156e-07,
"loss": 0.0012,
"reward": 2.047786593437195,
"reward_std": 0.4053535610437393,
"rewards/accuracy_reward": 1.0946615934371948,
"rewards/format_reward": 0.953125,
"step": 6
},
{
"completion_length": 76.03125,
"epoch": 0.08433734939759036,
"grad_norm": 4.7579852016782045,
"kl": 0.033935546875,
"learning_rate": 9.859437751004016e-07,
"loss": 0.0014,
"reward": 2.1630080938339233,
"reward_std": 0.3877447098493576,
"rewards/accuracy_reward": 1.2333204746246338,
"rewards/format_reward": 0.9296875,
"step": 7
},
{
"completion_length": 71.546875,
"epoch": 0.0963855421686747,
"grad_norm": 9.256093312505593,
"kl": 0.244384765625,
"learning_rate": 9.839357429718876e-07,
"loss": 0.0097,
"reward": 2.015242576599121,
"reward_std": 0.4337102472782135,
"rewards/accuracy_reward": 1.054305076599121,
"rewards/format_reward": 0.9609375,
"step": 8
},
{
"completion_length": 72.1796875,
"epoch": 0.10843373493975904,
"grad_norm": 9.959610046323814,
"kl": 0.2841796875,
"learning_rate": 9.819277108433734e-07,
"loss": 0.0114,
"reward": 1.9989103078842163,
"reward_std": 0.38074547052383423,
"rewards/accuracy_reward": 1.0145351886749268,
"rewards/format_reward": 0.984375,
"step": 9
},
{
"completion_length": 67.0078125,
"epoch": 0.12048192771084337,
"grad_norm": 4.494217301954794,
"kl": 0.0677490234375,
"learning_rate": 9.799196787148593e-07,
"loss": 0.0027,
"reward": 2.208647847175598,
"reward_std": 0.20472895354032516,
"rewards/accuracy_reward": 1.2086476683616638,
"rewards/format_reward": 1.0,
"step": 10
},
{
"completion_length": 66.3125,
"epoch": 0.13253012048192772,
"grad_norm": 4.205085729740715,
"kl": 0.111083984375,
"learning_rate": 9.779116465863453e-07,
"loss": 0.0044,
"reward": 2.016738772392273,
"reward_std": 0.39626075327396393,
"rewards/accuracy_reward": 1.0323637425899506,
"rewards/format_reward": 0.984375,
"step": 11
},
{
"completion_length": 64.2265625,
"epoch": 0.14457831325301204,
"grad_norm": 5.285643902891126,
"kl": 0.0670166015625,
"learning_rate": 9.759036144578313e-07,
"loss": 0.0027,
"reward": 2.0809445977211,
"reward_std": 0.3285638391971588,
"rewards/accuracy_reward": 1.080944538116455,
"rewards/format_reward": 1.0,
"step": 12
},
{
"completion_length": 57.7265625,
"epoch": 0.1566265060240964,
"grad_norm": 5.332797970620105,
"kl": 0.07958984375,
"learning_rate": 9.738955823293173e-07,
"loss": 0.0032,
"reward": 2.1677627563476562,
"reward_std": 0.32235731184482574,
"rewards/accuracy_reward": 1.1677626371383667,
"rewards/format_reward": 1.0,
"step": 13
},
{
"completion_length": 62.765625,
"epoch": 0.1686746987951807,
"grad_norm": 7.594424067233083,
"kl": 0.086181640625,
"learning_rate": 9.718875502008033e-07,
"loss": 0.0034,
"reward": 2.287484049797058,
"reward_std": 0.2577601447701454,
"rewards/accuracy_reward": 1.3031091094017029,
"rewards/format_reward": 0.984375,
"step": 14
},
{
"completion_length": 61.28125,
"epoch": 0.18072289156626506,
"grad_norm": 6.602361615736723,
"kl": 0.087890625,
"learning_rate": 9.69879518072289e-07,
"loss": 0.0035,
"reward": 2.28032910823822,
"reward_std": 0.38463760912418365,
"rewards/accuracy_reward": 1.2881416082382202,
"rewards/format_reward": 0.9921875,
"step": 15
},
{
"completion_length": 63.6796875,
"epoch": 0.1927710843373494,
"grad_norm": 4.1986480450121135,
"kl": 0.078125,
"learning_rate": 9.67871485943775e-07,
"loss": 0.0031,
"reward": 2.1277613639831543,
"reward_std": 0.2963729351758957,
"rewards/accuracy_reward": 1.1433865427970886,
"rewards/format_reward": 0.984375,
"step": 16
},
{
"completion_length": 60.65625,
"epoch": 0.20481927710843373,
"grad_norm": 6.921299965436032,
"kl": 0.088134765625,
"learning_rate": 9.65863453815261e-07,
"loss": 0.0035,
"reward": 2.157727599143982,
"reward_std": 0.30868735909461975,
"rewards/accuracy_reward": 1.1733525395393372,
"rewards/format_reward": 0.984375,
"step": 17
},
{
"completion_length": 59.2265625,
"epoch": 0.21686746987951808,
"grad_norm": 4.904213548043611,
"kl": 0.07666015625,
"learning_rate": 9.63855421686747e-07,
"loss": 0.0031,
"reward": 2.24626088142395,
"reward_std": 0.22766248881816864,
"rewards/accuracy_reward": 1.2540735006332397,
"rewards/format_reward": 0.9921875,
"step": 18
},
{
"completion_length": 58.703125,
"epoch": 0.2289156626506024,
"grad_norm": 4.786279154756674,
"kl": 0.109619140625,
"learning_rate": 9.61847389558233e-07,
"loss": 0.0044,
"reward": 2.050855040550232,
"reward_std": 0.35161878168582916,
"rewards/accuracy_reward": 1.0586674511432648,
"rewards/format_reward": 0.9921875,
"step": 19
},
{
"completion_length": 58.109375,
"epoch": 0.24096385542168675,
"grad_norm": 4.05967579782597,
"kl": 0.08056640625,
"learning_rate": 9.598393574297187e-07,
"loss": 0.0032,
"reward": 2.20633327960968,
"reward_std": 0.3129453659057617,
"rewards/accuracy_reward": 1.2219581604003906,
"rewards/format_reward": 0.984375,
"step": 20
},
{
"completion_length": 57.71875,
"epoch": 0.25301204819277107,
"grad_norm": 5.8300935596675885,
"kl": 0.080078125,
"learning_rate": 9.57831325301205e-07,
"loss": 0.0032,
"reward": 2.417273759841919,
"reward_std": 0.28760989010334015,
"rewards/accuracy_reward": 1.4250862002372742,
"rewards/format_reward": 0.9921875,
"step": 21
},
{
"completion_length": 54.5859375,
"epoch": 0.26506024096385544,
"grad_norm": 7.535044861581114,
"kl": 0.106201171875,
"learning_rate": 9.558232931726907e-07,
"loss": 0.0042,
"reward": 2.2527129650115967,
"reward_std": 0.2951706647872925,
"rewards/accuracy_reward": 1.2683378458023071,
"rewards/format_reward": 0.984375,
"step": 22
},
{
"completion_length": 61.09375,
"epoch": 0.27710843373493976,
"grad_norm": 4.416172924233661,
"kl": 0.10009765625,
"learning_rate": 9.538152610441766e-07,
"loss": 0.004,
"reward": 2.1894314289093018,
"reward_std": 0.21257736533880234,
"rewards/accuracy_reward": 1.1894314289093018,
"rewards/format_reward": 1.0,
"step": 23
},
{
"completion_length": 54.9921875,
"epoch": 0.2891566265060241,
"grad_norm": 4.553446996976198,
"kl": 0.09814453125,
"learning_rate": 9.518072289156625e-07,
"loss": 0.0039,
"reward": 2.3037142753601074,
"reward_std": 0.3323938250541687,
"rewards/accuracy_reward": 1.3115268349647522,
"rewards/format_reward": 0.9921875,
"step": 24
},
{
"completion_length": 55.9921875,
"epoch": 0.30120481927710846,
"grad_norm": 8.671383785487564,
"kl": 0.120849609375,
"learning_rate": 9.497991967871486e-07,
"loss": 0.0048,
"reward": 2.239556074142456,
"reward_std": 0.3447880446910858,
"rewards/accuracy_reward": 1.2551808953285217,
"rewards/format_reward": 0.984375,
"step": 25
},
{
"completion_length": 58.7890625,
"epoch": 0.3132530120481928,
"grad_norm": 8.322624639517006,
"kl": 0.12353515625,
"learning_rate": 9.477911646586345e-07,
"loss": 0.0049,
"reward": 2.2209770679473877,
"reward_std": 0.3139883056282997,
"rewards/accuracy_reward": 1.2287896275520325,
"rewards/format_reward": 0.9921875,
"step": 26
},
{
"completion_length": 56.2421875,
"epoch": 0.3253012048192771,
"grad_norm": 20.55146941012377,
"kl": 0.130126953125,
"learning_rate": 9.457831325301205e-07,
"loss": 0.0052,
"reward": 2.344720959663391,
"reward_std": 0.25742725282907486,
"rewards/accuracy_reward": 1.3525334596633911,
"rewards/format_reward": 0.9921875,
"step": 27
},
{
"completion_length": 52.3671875,
"epoch": 0.3373493975903614,
"grad_norm": 4.550988243582887,
"kl": 0.12548828125,
"learning_rate": 9.437751004016063e-07,
"loss": 0.005,
"reward": 2.407941460609436,
"reward_std": 0.3139786869287491,
"rewards/accuracy_reward": 1.4313790798187256,
"rewards/format_reward": 0.9765625,
"step": 28
},
{
"completion_length": 53.328125,
"epoch": 0.3493975903614458,
"grad_norm": 5.133796660962732,
"kl": 0.1435546875,
"learning_rate": 9.417670682730924e-07,
"loss": 0.0057,
"reward": 2.3306795358657837,
"reward_std": 0.3039723336696625,
"rewards/accuracy_reward": 1.3463045954704285,
"rewards/format_reward": 0.984375,
"step": 29
},
{
"completion_length": 53.8125,
"epoch": 0.3614457831325301,
"grad_norm": 6.796717577260548,
"kl": 0.27880859375,
"learning_rate": 9.397590361445783e-07,
"loss": 0.0112,
"reward": 2.2834625244140625,
"reward_std": 0.3063512295484543,
"rewards/accuracy_reward": 1.2834625244140625,
"rewards/format_reward": 1.0,
"step": 30
},
{
"completion_length": 56.3203125,
"epoch": 0.37349397590361444,
"grad_norm": 4.3393989853337285,
"kl": 0.14794921875,
"learning_rate": 9.377510040160642e-07,
"loss": 0.0059,
"reward": 2.354575991630554,
"reward_std": 0.314766064286232,
"rewards/accuracy_reward": 1.3623886704444885,
"rewards/format_reward": 0.9921875,
"step": 31
},
{
"completion_length": 54.171875,
"epoch": 0.3855421686746988,
"grad_norm": 4.279946209704863,
"kl": 0.197265625,
"learning_rate": 9.357429718875502e-07,
"loss": 0.0079,
"reward": 2.1385136246681213,
"reward_std": 0.24586574733257294,
"rewards/accuracy_reward": 1.1463261544704437,
"rewards/format_reward": 0.9921875,
"step": 32
},
{
"completion_length": 51.4140625,
"epoch": 0.39759036144578314,
"grad_norm": 5.88762957444806,
"kl": 0.1630859375,
"learning_rate": 9.33734939759036e-07,
"loss": 0.0065,
"reward": 2.2907108068466187,
"reward_std": 0.25231631100177765,
"rewards/accuracy_reward": 1.2907109260559082,
"rewards/format_reward": 1.0,
"step": 33
},
{
"completion_length": 50.4609375,
"epoch": 0.40963855421686746,
"grad_norm": 5.469228934242547,
"kl": 0.16845703125,
"learning_rate": 9.317269076305221e-07,
"loss": 0.0067,
"reward": 2.2533600330352783,
"reward_std": 0.25808002054691315,
"rewards/accuracy_reward": 1.2611725330352783,
"rewards/format_reward": 0.9921875,
"step": 34
},
{
"completion_length": 47.84375,
"epoch": 0.42168674698795183,
"grad_norm": 5.412602747215773,
"kl": 0.177734375,
"learning_rate": 9.29718875502008e-07,
"loss": 0.0071,
"reward": 2.3132054805755615,
"reward_std": 0.2454073503613472,
"rewards/accuracy_reward": 1.3132054805755615,
"rewards/format_reward": 1.0,
"step": 35
},
{
"completion_length": 44.21875,
"epoch": 0.43373493975903615,
"grad_norm": 5.190368238545804,
"kl": 0.2275390625,
"learning_rate": 9.27710843373494e-07,
"loss": 0.0091,
"reward": 2.2854232788085938,
"reward_std": 0.29085223376750946,
"rewards/accuracy_reward": 1.293235719203949,
"rewards/format_reward": 0.9921875,
"step": 36
},
{
"completion_length": 48.71875,
"epoch": 0.4457831325301205,
"grad_norm": 4.780274291960778,
"kl": 0.20751953125,
"learning_rate": 9.257028112449798e-07,
"loss": 0.0083,
"reward": 2.246184825897217,
"reward_std": 0.31601477414369583,
"rewards/accuracy_reward": 1.261809766292572,
"rewards/format_reward": 0.984375,
"step": 37
},
{
"completion_length": 42.265625,
"epoch": 0.4578313253012048,
"grad_norm": 6.234590681750942,
"kl": 0.265625,
"learning_rate": 9.236947791164659e-07,
"loss": 0.0106,
"reward": 2.112604260444641,
"reward_std": 0.30199334025382996,
"rewards/accuracy_reward": 1.1126042604446411,
"rewards/format_reward": 1.0,
"step": 38
},
{
"completion_length": 45.1015625,
"epoch": 0.46987951807228917,
"grad_norm": 4.611394363412455,
"kl": 0.15576171875,
"learning_rate": 9.216867469879518e-07,
"loss": 0.0062,
"reward": 2.3590028285980225,
"reward_std": 0.2973439395427704,
"rewards/accuracy_reward": 1.3746278285980225,
"rewards/format_reward": 0.984375,
"step": 39
},
{
"completion_length": 45.3046875,
"epoch": 0.4819277108433735,
"grad_norm": 6.117578716606278,
"kl": 0.17626953125,
"learning_rate": 9.196787148594377e-07,
"loss": 0.0071,
"reward": 2.2271867990493774,
"reward_std": 0.22323830425739288,
"rewards/accuracy_reward": 1.234999418258667,
"rewards/format_reward": 0.9921875,
"step": 40
},
{
"completion_length": 41.9453125,
"epoch": 0.4939759036144578,
"grad_norm": 4.858430237306144,
"kl": 0.2236328125,
"learning_rate": 9.176706827309237e-07,
"loss": 0.0089,
"reward": 2.217424750328064,
"reward_std": 0.2663164809346199,
"rewards/accuracy_reward": 1.2252373099327087,
"rewards/format_reward": 0.9921875,
"step": 41
},
{
"completion_length": 41.0234375,
"epoch": 0.5060240963855421,
"grad_norm": 4.127212546225013,
"kl": 0.18212890625,
"learning_rate": 9.156626506024095e-07,
"loss": 0.0073,
"reward": 2.16755473613739,
"reward_std": 0.3387562334537506,
"rewards/accuracy_reward": 1.1753671169281006,
"rewards/format_reward": 0.9921875,
"step": 42
},
{
"completion_length": 42.6640625,
"epoch": 0.5180722891566265,
"grad_norm": 5.226665280180925,
"kl": 0.23193359375,
"learning_rate": 9.136546184738956e-07,
"loss": 0.0093,
"reward": 2.203770875930786,
"reward_std": 0.3409430831670761,
"rewards/accuracy_reward": 1.2350206971168518,
"rewards/format_reward": 0.96875,
"step": 43
},
{
"completion_length": 40.9609375,
"epoch": 0.5301204819277109,
"grad_norm": 4.308668359699942,
"kl": 0.134033203125,
"learning_rate": 9.116465863453815e-07,
"loss": 0.0054,
"reward": 2.2817225456237793,
"reward_std": 0.19574209302663803,
"rewards/accuracy_reward": 1.281722605228424,
"rewards/format_reward": 1.0,
"step": 44
},
{
"completion_length": 38.7734375,
"epoch": 0.5421686746987951,
"grad_norm": 6.033974360622575,
"kl": 0.13232421875,
"learning_rate": 9.096385542168675e-07,
"loss": 0.0053,
"reward": 2.2139052152633667,
"reward_std": 0.28486668318510056,
"rewards/accuracy_reward": 1.2451552748680115,
"rewards/format_reward": 0.96875,
"step": 45
},
{
"completion_length": 41.1484375,
"epoch": 0.5542168674698795,
"grad_norm": 5.314865555502224,
"kl": 0.11279296875,
"learning_rate": 9.076305220883533e-07,
"loss": 0.0045,
"reward": 2.4188212156295776,
"reward_std": 0.2556447684764862,
"rewards/accuracy_reward": 1.4266336560249329,
"rewards/format_reward": 0.9921875,
"step": 46
},
{
"completion_length": 42.7109375,
"epoch": 0.5662650602409639,
"grad_norm": 3.687080063413381,
"kl": 0.123046875,
"learning_rate": 9.056224899598393e-07,
"loss": 0.0049,
"reward": 2.2985291481018066,
"reward_std": 0.2858593165874481,
"rewards/accuracy_reward": 1.3063417077064514,
"rewards/format_reward": 0.9921875,
"step": 47
},
{
"completion_length": 46.859375,
"epoch": 0.5783132530120482,
"grad_norm": 4.277184476359137,
"kl": 0.20166015625,
"learning_rate": 9.036144578313253e-07,
"loss": 0.0081,
"reward": 2.1704814434051514,
"reward_std": 0.3619203567504883,
"rewards/accuracy_reward": 1.186106562614441,
"rewards/format_reward": 0.984375,
"step": 48
},
{
"completion_length": 45.21875,
"epoch": 0.5903614457831325,
"grad_norm": 3.7971557376020577,
"kl": 0.124267578125,
"learning_rate": 9.016064257028112e-07,
"loss": 0.005,
"reward": 2.1000068187713623,
"reward_std": 0.2924596816301346,
"rewards/accuracy_reward": 1.123444378376007,
"rewards/format_reward": 0.9765625,
"step": 49
},
{
"completion_length": 44.7734375,
"epoch": 0.6024096385542169,
"grad_norm": 4.458817172061971,
"kl": 0.111083984375,
"learning_rate": 8.995983935742972e-07,
"loss": 0.0044,
"reward": 2.2635247707366943,
"reward_std": 0.3522821515798569,
"rewards/accuracy_reward": 1.2869621515274048,
"rewards/format_reward": 0.9765625,
"step": 50
},
{
"completion_length": 51.5859375,
"epoch": 0.6144578313253012,
"grad_norm": 5.351600002967812,
"kl": 0.115234375,
"learning_rate": 8.97590361445783e-07,
"loss": 0.0046,
"reward": 2.321009397506714,
"reward_std": 0.23405297100543976,
"rewards/accuracy_reward": 1.3366344571113586,
"rewards/format_reward": 0.984375,
"step": 51
},
{
"completion_length": 50.421875,
"epoch": 0.6265060240963856,
"grad_norm": 4.213335817741083,
"kl": 0.1396484375,
"learning_rate": 8.955823293172691e-07,
"loss": 0.0056,
"reward": 2.3553450107574463,
"reward_std": 0.25443293899297714,
"rewards/accuracy_reward": 1.3944076299667358,
"rewards/format_reward": 0.9609375,
"step": 52
},
{
"completion_length": 60.6015625,
"epoch": 0.6385542168674698,
"grad_norm": 6.123689334744157,
"kl": 0.121337890625,
"learning_rate": 8.93574297188755e-07,
"loss": 0.0049,
"reward": 2.112071990966797,
"reward_std": 0.30149899423122406,
"rewards/accuracy_reward": 1.1433220505714417,
"rewards/format_reward": 0.96875,
"step": 53
},
{
"completion_length": 50.0703125,
"epoch": 0.6506024096385542,
"grad_norm": 4.396654754831157,
"kl": 0.1337890625,
"learning_rate": 8.915662650602409e-07,
"loss": 0.0053,
"reward": 2.233729839324951,
"reward_std": 0.23247240483760834,
"rewards/accuracy_reward": 1.2571672797203064,
"rewards/format_reward": 0.9765625,
"step": 54
},
{
"completion_length": 60.2890625,
"epoch": 0.6626506024096386,
"grad_norm": 7.03985835954293,
"kl": 0.10498046875,
"learning_rate": 8.895582329317268e-07,
"loss": 0.0042,
"reward": 2.196902871131897,
"reward_std": 0.2882121652364731,
"rewards/accuracy_reward": 1.2125278115272522,
"rewards/format_reward": 0.984375,
"step": 55
},
{
"completion_length": 50.640625,
"epoch": 0.6746987951807228,
"grad_norm": 4.86896494949543,
"kl": 0.12451171875,
"learning_rate": 8.875502008032128e-07,
"loss": 0.005,
"reward": 2.171112537384033,
"reward_std": 0.16461243480443954,
"rewards/accuracy_reward": 1.1867375373840332,
"rewards/format_reward": 0.984375,
"step": 56
},
{
"completion_length": 53.21875,
"epoch": 0.6867469879518072,
"grad_norm": 3.557538165261062,
"kl": 0.1240234375,
"learning_rate": 8.855421686746988e-07,
"loss": 0.005,
"reward": 2.2328275442123413,
"reward_std": 0.2752218544483185,
"rewards/accuracy_reward": 1.2406402230262756,
"rewards/format_reward": 0.9921875,
"step": 57
},
{
"completion_length": 47.8671875,
"epoch": 0.6987951807228916,
"grad_norm": 5.180162989820259,
"kl": 0.125,
"learning_rate": 8.835341365461847e-07,
"loss": 0.005,
"reward": 2.2453041076660156,
"reward_std": 0.315682128071785,
"rewards/accuracy_reward": 1.268741488456726,
"rewards/format_reward": 0.9765625,
"step": 58
},
{
"completion_length": 57.9765625,
"epoch": 0.7108433734939759,
"grad_norm": 3.899105782667564,
"kl": 0.10205078125,
"learning_rate": 8.815261044176707e-07,
"loss": 0.0041,
"reward": 2.284543514251709,
"reward_std": 0.25333235412836075,
"rewards/accuracy_reward": 1.292356252670288,
"rewards/format_reward": 0.9921875,
"step": 59
},
{
"completion_length": 46.5859375,
"epoch": 0.7228915662650602,
"grad_norm": 13.765129472909528,
"kl": 0.106201171875,
"learning_rate": 8.795180722891565e-07,
"loss": 0.0042,
"reward": 2.113099694252014,
"reward_std": 0.326066330075264,
"rewards/accuracy_reward": 1.1287246942520142,
"rewards/format_reward": 0.984375,
"step": 60
},
{
"completion_length": 46.375,
"epoch": 0.7349397590361446,
"grad_norm": 6.1270425433473,
"kl": 0.16357421875,
"learning_rate": 8.775100401606425e-07,
"loss": 0.0065,
"reward": 1.9968695640563965,
"reward_std": 0.34320104122161865,
"rewards/accuracy_reward": 1.0124945640563965,
"rewards/format_reward": 0.984375,
"step": 61
},
{
"completion_length": 53.09375,
"epoch": 0.7469879518072289,
"grad_norm": 4.3056291481606745,
"kl": 0.1513671875,
"learning_rate": 8.755020080321285e-07,
"loss": 0.0061,
"reward": 2.1780970096588135,
"reward_std": 0.2706674858927727,
"rewards/accuracy_reward": 1.2093469500541687,
"rewards/format_reward": 0.96875,
"step": 62
},
{
"completion_length": 55.9375,
"epoch": 0.7590361445783133,
"grad_norm": 3.2395174572422416,
"kl": 0.14501953125,
"learning_rate": 8.734939759036144e-07,
"loss": 0.0058,
"reward": 2.1430922746658325,
"reward_std": 0.24412654340267181,
"rewards/accuracy_reward": 1.1665297150611877,
"rewards/format_reward": 0.9765625,
"step": 63
},
{
"completion_length": 56.6328125,
"epoch": 0.7710843373493976,
"grad_norm": 4.190814109425291,
"kl": 0.11962890625,
"learning_rate": 8.714859437751003e-07,
"loss": 0.0048,
"reward": 2.1700193881988525,
"reward_std": 0.2942150831222534,
"rewards/accuracy_reward": 1.1934569478034973,
"rewards/format_reward": 0.9765625,
"step": 64
},
{
"completion_length": 64.3984375,
"epoch": 0.7831325301204819,
"grad_norm": 3.226137200230793,
"kl": 0.102783203125,
"learning_rate": 8.694779116465863e-07,
"loss": 0.0041,
"reward": 2.2898290157318115,
"reward_std": 0.2443845123052597,
"rewards/accuracy_reward": 1.3132665753364563,
"rewards/format_reward": 0.9765625,
"step": 65
},
{
"completion_length": 67.7109375,
"epoch": 0.7951807228915663,
"grad_norm": 3.9157620361816314,
"kl": 0.0927734375,
"learning_rate": 8.674698795180723e-07,
"loss": 0.0037,
"reward": 2.161790609359741,
"reward_std": 0.29590657353401184,
"rewards/accuracy_reward": 1.1696029901504517,
"rewards/format_reward": 0.9921875,
"step": 66
},
{
"completion_length": 74.3203125,
"epoch": 0.8072289156626506,
"grad_norm": 3.1212414712368375,
"kl": 0.082763671875,
"learning_rate": 8.654618473895582e-07,
"loss": 0.0033,
"reward": 2.215745210647583,
"reward_std": 0.2766411006450653,
"rewards/accuracy_reward": 1.2313700914382935,
"rewards/format_reward": 0.984375,
"step": 67
},
{
"completion_length": 74.0390625,
"epoch": 0.8192771084337349,
"grad_norm": 3.446969302283755,
"kl": 0.074951171875,
"learning_rate": 8.634538152610441e-07,
"loss": 0.003,
"reward": 2.1964612007141113,
"reward_std": 0.235237754881382,
"rewards/accuracy_reward": 1.2198986411094666,
"rewards/format_reward": 0.9765625,
"step": 68
},
{
"completion_length": 76.9375,
"epoch": 0.8313253012048193,
"grad_norm": 3.310962519125171,
"kl": 0.08154296875,
"learning_rate": 8.614457831325301e-07,
"loss": 0.0033,
"reward": 2.1269989013671875,
"reward_std": 0.2448011264204979,
"rewards/accuracy_reward": 1.1426239013671875,
"rewards/format_reward": 0.984375,
"step": 69
},
{
"completion_length": 71.3984375,
"epoch": 0.8433734939759037,
"grad_norm": 3.2998576155248966,
"kl": 0.0888671875,
"learning_rate": 8.59437751004016e-07,
"loss": 0.0036,
"reward": 2.2479825019836426,
"reward_std": 0.2886482775211334,
"rewards/accuracy_reward": 1.2636074423789978,
"rewards/format_reward": 0.984375,
"step": 70
},
{
"completion_length": 72.1484375,
"epoch": 0.8554216867469879,
"grad_norm": 7.668000907111886,
"kl": 0.07861328125,
"learning_rate": 8.57429718875502e-07,
"loss": 0.0031,
"reward": 2.2247371673583984,
"reward_std": 0.2391326129436493,
"rewards/accuracy_reward": 1.2637996673583984,
"rewards/format_reward": 0.9609375,
"step": 71
},
{
"completion_length": 77.7734375,
"epoch": 0.8674698795180723,
"grad_norm": 3.4104191137958013,
"kl": 0.068359375,
"learning_rate": 8.554216867469879e-07,
"loss": 0.0027,
"reward": 2.2031702995300293,
"reward_std": 0.21321924775838852,
"rewards/accuracy_reward": 1.210982859134674,
"rewards/format_reward": 0.9921875,
"step": 72
},
{
"completion_length": 76.5546875,
"epoch": 0.8795180722891566,
"grad_norm": 3.884229840630286,
"kl": 0.0947265625,
"learning_rate": 8.534136546184738e-07,
"loss": 0.0038,
"reward": 2.2307136058807373,
"reward_std": 0.2959597185254097,
"rewards/accuracy_reward": 1.2463387250900269,
"rewards/format_reward": 0.984375,
"step": 73
},
{
"completion_length": 73.7265625,
"epoch": 0.891566265060241,
"grad_norm": 7.2397255809983525,
"kl": 0.170654296875,
"learning_rate": 8.514056224899598e-07,
"loss": 0.0068,
"reward": 2.311343193054199,
"reward_std": 0.21377335488796234,
"rewards/accuracy_reward": 1.319155752658844,
"rewards/format_reward": 0.9921875,
"step": 74
},
{
"completion_length": 71.5859375,
"epoch": 0.9036144578313253,
"grad_norm": 3.397020763244455,
"kl": 0.073974609375,
"learning_rate": 8.493975903614458e-07,
"loss": 0.003,
"reward": 2.3479005098342896,
"reward_std": 0.2722414582967758,
"rewards/accuracy_reward": 1.3713379502296448,
"rewards/format_reward": 0.9765625,
"step": 75
},
{
"completion_length": 64.34375,
"epoch": 0.9156626506024096,
"grad_norm": 4.709358727325993,
"kl": 0.116455078125,
"learning_rate": 8.473895582329317e-07,
"loss": 0.0047,
"reward": 2.1038066148757935,
"reward_std": 0.3149692267179489,
"rewards/accuracy_reward": 1.158493995666504,
"rewards/format_reward": 0.9453125,
"step": 76
},
{
"completion_length": 69.390625,
"epoch": 0.927710843373494,
"grad_norm": 3.3768601117352923,
"kl": 0.11376953125,
"learning_rate": 8.453815261044176e-07,
"loss": 0.0046,
"reward": 2.02778023481369,
"reward_std": 0.3105141818523407,
"rewards/accuracy_reward": 1.074655294418335,
"rewards/format_reward": 0.953125,
"step": 77
},
{
"completion_length": 67.328125,
"epoch": 0.9397590361445783,
"grad_norm": 3.504578270706009,
"kl": 0.115234375,
"learning_rate": 8.433734939759036e-07,
"loss": 0.0046,
"reward": 2.194709539413452,
"reward_std": 0.27273692935705185,
"rewards/accuracy_reward": 1.2181469202041626,
"rewards/format_reward": 0.9765625,
"step": 78
},
{
"completion_length": 75.1640625,
"epoch": 0.9518072289156626,
"grad_norm": 4.043012399812061,
"kl": 0.123046875,
"learning_rate": 8.413654618473895e-07,
"loss": 0.0049,
"reward": 2.13509202003479,
"reward_std": 0.313528910279274,
"rewards/accuracy_reward": 1.18196702003479,
"rewards/format_reward": 0.953125,
"step": 79
},
{
"completion_length": 70.0234375,
"epoch": 0.963855421686747,
"grad_norm": 4.870660538899373,
"kl": 0.086181640625,
"learning_rate": 8.393574297188755e-07,
"loss": 0.0035,
"reward": 2.1953389644622803,
"reward_std": 0.26908765733242035,
"rewards/accuracy_reward": 1.2265888452529907,
"rewards/format_reward": 0.96875,
"step": 80
},
{
"completion_length": 80.859375,
"epoch": 0.9759036144578314,
"grad_norm": 3.8261245848047065,
"kl": 0.1015625,
"learning_rate": 8.373493975903614e-07,
"loss": 0.0041,
"reward": 2.0212653279304504,
"reward_std": 0.3835397958755493,
"rewards/accuracy_reward": 1.0915777683258057,
"rewards/format_reward": 0.9296875,
"step": 81
},
{
"completion_length": 74.046875,
"epoch": 0.9879518072289156,
"grad_norm": 4.0964460767880535,
"kl": 0.083984375,
"learning_rate": 8.353413654618474e-07,
"loss": 0.0034,
"reward": 2.2536615133285522,
"reward_std": 0.2658763527870178,
"rewards/accuracy_reward": 1.2770991325378418,
"rewards/format_reward": 0.9765625,
"step": 82
},
{
"completion_length": 74.58333587646484,
"epoch": 1.0,
"grad_norm": 2.9272571318373655,
"kl": 0.1044921875,
"learning_rate": 8.333333333333333e-07,
"loss": 0.004,
"reward": 2.1187774538993835,
"reward_std": 0.1469321921467781,
"rewards/accuracy_reward": 1.1187774240970612,
"rewards/format_reward": 1.0,
"step": 83
},
{
"completion_length": 67.5390625,
"epoch": 1.0120481927710843,
"grad_norm": 4.360041456699287,
"kl": 0.116455078125,
"learning_rate": 8.313253012048192e-07,
"loss": 0.0047,
"reward": 2.2748764753341675,
"reward_std": 0.30198951065540314,
"rewards/accuracy_reward": 1.2983139157295227,
"rewards/format_reward": 0.9765625,
"step": 84
},
{
"completion_length": 71.640625,
"epoch": 1.0240963855421688,
"grad_norm": 3.852904865115574,
"kl": 0.100341796875,
"learning_rate": 8.293172690763052e-07,
"loss": 0.004,
"reward": 2.22179639339447,
"reward_std": 0.2614322751760483,
"rewards/accuracy_reward": 1.2452340126037598,
"rewards/format_reward": 0.9765625,
"step": 85
},
{
"completion_length": 77.71875,
"epoch": 1.036144578313253,
"grad_norm": 4.570601093607917,
"kl": 0.086181640625,
"learning_rate": 8.273092369477911e-07,
"loss": 0.0034,
"reward": 2.3267804384231567,
"reward_std": 0.1871008574962616,
"rewards/accuracy_reward": 1.3424054384231567,
"rewards/format_reward": 0.984375,
"step": 86
},
{
"completion_length": 74.0703125,
"epoch": 1.0481927710843373,
"grad_norm": 4.387034223472388,
"kl": 0.09033203125,
"learning_rate": 8.253012048192771e-07,
"loss": 0.0036,
"reward": 2.280067205429077,
"reward_std": 0.2090277522802353,
"rewards/accuracy_reward": 1.2800670266151428,
"rewards/format_reward": 1.0,
"step": 87
},
{
"completion_length": 72.8828125,
"epoch": 1.0602409638554218,
"grad_norm": 3.640432077142004,
"kl": 0.097412109375,
"learning_rate": 8.23293172690763e-07,
"loss": 0.0039,
"reward": 2.2264442443847656,
"reward_std": 0.2877971976995468,
"rewards/accuracy_reward": 1.2576942443847656,
"rewards/format_reward": 0.96875,
"step": 88
},
{
"completion_length": 68.9765625,
"epoch": 1.072289156626506,
"grad_norm": 3.6617214501921755,
"kl": 0.10107421875,
"learning_rate": 8.21285140562249e-07,
"loss": 0.004,
"reward": 2.232625722885132,
"reward_std": 0.26599176973104477,
"rewards/accuracy_reward": 1.2482507824897766,
"rewards/format_reward": 0.984375,
"step": 89
},
{
"completion_length": 74.765625,
"epoch": 1.0843373493975903,
"grad_norm": 4.600311265578528,
"kl": 0.09130859375,
"learning_rate": 8.192771084337349e-07,
"loss": 0.0037,
"reward": 2.253629207611084,
"reward_std": 0.21175827831029892,
"rewards/accuracy_reward": 1.269254207611084,
"rewards/format_reward": 0.984375,
"step": 90
},
{
"completion_length": 76.59375,
"epoch": 1.0963855421686748,
"grad_norm": 4.145602929032845,
"kl": 0.087646484375,
"learning_rate": 8.172690763052207e-07,
"loss": 0.0035,
"reward": 2.2744953632354736,
"reward_std": 0.24358398467302322,
"rewards/accuracy_reward": 1.2901203632354736,
"rewards/format_reward": 0.984375,
"step": 91
},
{
"completion_length": 75.875,
"epoch": 1.108433734939759,
"grad_norm": 3.8292102418969853,
"kl": 0.10693359375,
"learning_rate": 8.152610441767068e-07,
"loss": 0.0043,
"reward": 2.4102468490600586,
"reward_std": 0.22168071568012238,
"rewards/accuracy_reward": 1.4180592894554138,
"rewards/format_reward": 0.9921875,
"step": 92
},
{
"completion_length": 73.5078125,
"epoch": 1.1204819277108433,
"grad_norm": 3.889694391559541,
"kl": 0.0859375,
"learning_rate": 8.132530120481927e-07,
"loss": 0.0034,
"reward": 2.19115674495697,
"reward_std": 0.191669300198555,
"rewards/accuracy_reward": 1.1989692449569702,
"rewards/format_reward": 0.9921875,
"step": 93
},
{
"completion_length": 74.359375,
"epoch": 1.1325301204819278,
"grad_norm": 13.572499915490392,
"kl": 0.115966796875,
"learning_rate": 8.112449799196787e-07,
"loss": 0.0046,
"reward": 2.3821544647216797,
"reward_std": 0.2079356163740158,
"rewards/accuracy_reward": 1.3899668455123901,
"rewards/format_reward": 0.9921875,
"step": 94
},
{
"completion_length": 70.875,
"epoch": 1.144578313253012,
"grad_norm": 3.96863603284974,
"kl": 0.096923828125,
"learning_rate": 8.092369477911646e-07,
"loss": 0.0039,
"reward": 2.301279664039612,
"reward_std": 0.17724627256393433,
"rewards/accuracy_reward": 1.309092104434967,
"rewards/format_reward": 0.9921875,
"step": 95
},
{
"completion_length": 69.3125,
"epoch": 1.1566265060240963,
"grad_norm": 3.4379001474745206,
"kl": 0.090087890625,
"learning_rate": 8.072289156626506e-07,
"loss": 0.0036,
"reward": 2.371612310409546,
"reward_std": 0.1584479957818985,
"rewards/accuracy_reward": 1.371612310409546,
"rewards/format_reward": 1.0,
"step": 96
},
{
"completion_length": 68.6171875,
"epoch": 1.1686746987951806,
"grad_norm": 4.586260816062996,
"kl": 0.09375,
"learning_rate": 8.052208835341365e-07,
"loss": 0.0037,
"reward": 2.4862219095230103,
"reward_std": 0.20000579208135605,
"rewards/accuracy_reward": 1.4862220287322998,
"rewards/format_reward": 1.0,
"step": 97
},
{
"completion_length": 70.015625,
"epoch": 1.180722891566265,
"grad_norm": 4.047101829945655,
"kl": 0.112060546875,
"learning_rate": 8.032128514056225e-07,
"loss": 0.0045,
"reward": 2.2514266967773438,
"reward_std": 0.22294947504997253,
"rewards/accuracy_reward": 1.2514267563819885,
"rewards/format_reward": 1.0,
"step": 98
},
{
"completion_length": 66.9140625,
"epoch": 1.1927710843373494,
"grad_norm": 5.444249065473958,
"kl": 0.088134765625,
"learning_rate": 8.012048192771084e-07,
"loss": 0.0035,
"reward": 2.333179473876953,
"reward_std": 0.1811930388212204,
"rewards/accuracy_reward": 1.3331794738769531,
"rewards/format_reward": 1.0,
"step": 99
},
{
"completion_length": 65.828125,
"epoch": 1.2048192771084336,
"grad_norm": 7.074570957060863,
"kl": 0.1064453125,
"learning_rate": 7.991967871485942e-07,
"loss": 0.0043,
"reward": 2.278498649597168,
"reward_std": 0.17714769393205643,
"rewards/accuracy_reward": 1.2863109111785889,
"rewards/format_reward": 0.9921875,
"step": 100
},
{
"completion_length": 62.6875,
"epoch": 1.216867469879518,
"grad_norm": 6.600402598086416,
"kl": 0.099609375,
"learning_rate": 7.971887550200803e-07,
"loss": 0.004,
"reward": 2.3798866271972656,
"reward_std": 0.1492375209927559,
"rewards/accuracy_reward": 1.3798866868019104,
"rewards/format_reward": 1.0,
"step": 101
},
{
"completion_length": 67.234375,
"epoch": 1.2289156626506024,
"grad_norm": 5.4322907915163645,
"kl": 0.0927734375,
"learning_rate": 7.951807228915662e-07,
"loss": 0.0037,
"reward": 2.295409917831421,
"reward_std": 0.26540718972682953,
"rewards/accuracy_reward": 1.311034917831421,
"rewards/format_reward": 0.984375,
"step": 102
},
{
"completion_length": 62.59375,
"epoch": 1.2409638554216866,
"grad_norm": 4.734234621294123,
"kl": 0.10986328125,
"learning_rate": 7.931726907630522e-07,
"loss": 0.0044,
"reward": 2.3131519556045532,
"reward_std": 0.2041746824979782,
"rewards/accuracy_reward": 1.3209643959999084,
"rewards/format_reward": 0.9921875,
"step": 103
},
{
"completion_length": 65.0078125,
"epoch": 1.2530120481927711,
"grad_norm": 11.27432402123553,
"kl": 0.094482421875,
"learning_rate": 7.911646586345381e-07,
"loss": 0.0038,
"reward": 2.423591375350952,
"reward_std": 0.17853456735610962,
"rewards/accuracy_reward": 1.4235913753509521,
"rewards/format_reward": 1.0,
"step": 104
},
{
"completion_length": 61.96875,
"epoch": 1.2650602409638554,
"grad_norm": 5.605209449566961,
"kl": 0.10595703125,
"learning_rate": 7.891566265060241e-07,
"loss": 0.0042,
"reward": 2.2498486042022705,
"reward_std": 0.2505866587162018,
"rewards/accuracy_reward": 1.2576610445976257,
"rewards/format_reward": 0.9921875,
"step": 105
},
{
"completion_length": 69.890625,
"epoch": 1.2771084337349397,
"grad_norm": 9.555144265496201,
"kl": 0.1015625,
"learning_rate": 7.8714859437751e-07,
"loss": 0.0041,
"reward": 2.153669834136963,
"reward_std": 0.2159716784954071,
"rewards/accuracy_reward": 1.161482334136963,
"rewards/format_reward": 0.9921875,
"step": 106
},
{
"completion_length": 63.5625,
"epoch": 1.2891566265060241,
"grad_norm": 4.205528221959235,
"kl": 0.100341796875,
"learning_rate": 7.851405622489959e-07,
"loss": 0.004,
"reward": 2.2599010467529297,
"reward_std": 0.22189538180828094,
"rewards/accuracy_reward": 1.2599008083343506,
"rewards/format_reward": 1.0,
"step": 107
},
{
"completion_length": 60.3359375,
"epoch": 1.3012048192771084,
"grad_norm": 4.549607105799596,
"kl": 0.13525390625,
"learning_rate": 7.831325301204819e-07,
"loss": 0.0054,
"reward": 2.2945663928985596,
"reward_std": 0.2269488275051117,
"rewards/accuracy_reward": 1.2945663928985596,
"rewards/format_reward": 1.0,
"step": 108
},
{
"completion_length": 63.9765625,
"epoch": 1.3132530120481927,
"grad_norm": 7.122658458301131,
"kl": 0.10400390625,
"learning_rate": 7.811244979919679e-07,
"loss": 0.0042,
"reward": 2.223813772201538,
"reward_std": 0.2691728472709656,
"rewards/accuracy_reward": 1.2316263318061829,
"rewards/format_reward": 0.9921875,
"step": 109
},
{
"completion_length": 64.0390625,
"epoch": 1.3253012048192772,
"grad_norm": 4.0970391288989285,
"kl": 0.102783203125,
"learning_rate": 7.791164658634538e-07,
"loss": 0.0041,
"reward": 2.402035713195801,
"reward_std": 0.2192593812942505,
"rewards/accuracy_reward": 1.409848153591156,
"rewards/format_reward": 0.9921875,
"step": 110
},
{
"completion_length": 61.984375,
"epoch": 1.3373493975903614,
"grad_norm": 5.00798288991921,
"kl": 0.100830078125,
"learning_rate": 7.771084337349397e-07,
"loss": 0.004,
"reward": 2.268544912338257,
"reward_std": 0.17878198623657227,
"rewards/accuracy_reward": 1.2685450315475464,
"rewards/format_reward": 1.0,
"step": 111
},
{
"completion_length": 58.296875,
"epoch": 1.3493975903614457,
"grad_norm": 4.283142882967245,
"kl": 0.10888671875,
"learning_rate": 7.751004016064257e-07,
"loss": 0.0044,
"reward": 2.373852849006653,
"reward_std": 0.17504306137561798,
"rewards/accuracy_reward": 1.3738529086112976,
"rewards/format_reward": 1.0,
"step": 112
},
{
"completion_length": 60.484375,
"epoch": 1.3614457831325302,
"grad_norm": 4.840347639337677,
"kl": 0.097412109375,
"learning_rate": 7.730923694779116e-07,
"loss": 0.0039,
"reward": 2.2944198846817017,
"reward_std": 0.2088237851858139,
"rewards/accuracy_reward": 1.2944198250770569,
"rewards/format_reward": 1.0,
"step": 113
},
{
"completion_length": 59.6328125,
"epoch": 1.3734939759036144,
"grad_norm": 3.441438097506757,
"kl": 0.095458984375,
"learning_rate": 7.710843373493975e-07,
"loss": 0.0038,
"reward": 2.2015284299850464,
"reward_std": 0.22288134694099426,
"rewards/accuracy_reward": 1.201528549194336,
"rewards/format_reward": 1.0,
"step": 114
},
{
"completion_length": 58.3203125,
"epoch": 1.3855421686746987,
"grad_norm": 5.2560716101244545,
"kl": 0.12890625,
"learning_rate": 7.690763052208835e-07,
"loss": 0.0052,
"reward": 2.395646095275879,
"reward_std": 0.21848639845848083,
"rewards/accuracy_reward": 1.3956461548805237,
"rewards/format_reward": 1.0,
"step": 115
},
{
"completion_length": 58.2734375,
"epoch": 1.3975903614457832,
"grad_norm": 5.450406858307557,
"kl": 0.1064453125,
"learning_rate": 7.670682730923694e-07,
"loss": 0.0043,
"reward": 2.4746010303497314,
"reward_std": 0.1482101045548916,
"rewards/accuracy_reward": 1.4746010303497314,
"rewards/format_reward": 1.0,
"step": 116
},
{
"completion_length": 57.65625,
"epoch": 1.4096385542168675,
"grad_norm": 4.642950561404122,
"kl": 0.124267578125,
"learning_rate": 7.650602409638554e-07,
"loss": 0.005,
"reward": 2.1899147033691406,
"reward_std": 0.2073155865073204,
"rewards/accuracy_reward": 1.1977271437644958,
"rewards/format_reward": 0.9921875,
"step": 117
},
{
"completion_length": 56.609375,
"epoch": 1.4216867469879517,
"grad_norm": 9.36763410057133,
"kl": 0.112548828125,
"learning_rate": 7.630522088353414e-07,
"loss": 0.0045,
"reward": 2.457427501678467,
"reward_std": 0.248141810297966,
"rewards/accuracy_reward": 1.4574276804924011,
"rewards/format_reward": 1.0,
"step": 118
},
{
"completion_length": 55.59375,
"epoch": 1.4337349397590362,
"grad_norm": 4.076025029890633,
"kl": 0.095947265625,
"learning_rate": 7.610441767068273e-07,
"loss": 0.0038,
"reward": 2.3175806999206543,
"reward_std": 0.21353702247142792,
"rewards/accuracy_reward": 1.3175806999206543,
"rewards/format_reward": 1.0,
"step": 119
},
{
"completion_length": 56.359375,
"epoch": 1.4457831325301205,
"grad_norm": 4.1118838634058905,
"kl": 0.10693359375,
"learning_rate": 7.590361445783132e-07,
"loss": 0.0043,
"reward": 2.306099772453308,
"reward_std": 0.2674330025911331,
"rewards/accuracy_reward": 1.3217247128486633,
"rewards/format_reward": 0.984375,
"step": 120
},
{
"completion_length": 56.765625,
"epoch": 1.4578313253012047,
"grad_norm": 4.370520474393478,
"kl": 0.10302734375,
"learning_rate": 7.570281124497991e-07,
"loss": 0.0041,
"reward": 2.1378331184387207,
"reward_std": 0.24683931469917297,
"rewards/accuracy_reward": 1.1378332376480103,
"rewards/format_reward": 1.0,
"step": 121
},
{
"completion_length": 61.4453125,
"epoch": 1.4698795180722892,
"grad_norm": 3.7827942646929427,
"kl": 0.120361328125,
"learning_rate": 7.550200803212851e-07,
"loss": 0.0048,
"reward": 2.1952574253082275,
"reward_std": 0.163675457239151,
"rewards/accuracy_reward": 1.1952574849128723,
"rewards/format_reward": 1.0,
"step": 122
},
{
"completion_length": 64.2734375,
"epoch": 1.4819277108433735,
"grad_norm": 3.7942059326042887,
"kl": 0.115478515625,
"learning_rate": 7.53012048192771e-07,
"loss": 0.0046,
"reward": 2.052876114845276,
"reward_std": 0.3279467225074768,
"rewards/accuracy_reward": 1.0606885850429535,
"rewards/format_reward": 0.9921875,
"step": 123
},
{
"completion_length": 61.7578125,
"epoch": 1.4939759036144578,
"grad_norm": 4.163145774578374,
"kl": 0.1083984375,
"learning_rate": 7.51004016064257e-07,
"loss": 0.0043,
"reward": 2.483773946762085,
"reward_std": 0.21236886084079742,
"rewards/accuracy_reward": 1.483773946762085,
"rewards/format_reward": 1.0,
"step": 124
},
{
"completion_length": 69.8359375,
"epoch": 1.5060240963855422,
"grad_norm": 8.540024207287942,
"kl": 0.122314453125,
"learning_rate": 7.489959839357429e-07,
"loss": 0.0049,
"reward": 2.207366466522217,
"reward_std": 0.22365009784698486,
"rewards/accuracy_reward": 1.2073664665222168,
"rewards/format_reward": 1.0,
"step": 125
},
{
"completion_length": 68.21875,
"epoch": 1.5180722891566265,
"grad_norm": 4.163585518888115,
"kl": 0.097412109375,
"learning_rate": 7.469879518072289e-07,
"loss": 0.0039,
"reward": 2.3682451248168945,
"reward_std": 0.17314215004444122,
"rewards/accuracy_reward": 1.3682451844215393,
"rewards/format_reward": 1.0,
"step": 126
},
{
"completion_length": 74.7734375,
"epoch": 1.5301204819277108,
"grad_norm": 5.7954755578535595,
"kl": 0.09912109375,
"learning_rate": 7.449799196787149e-07,
"loss": 0.004,
"reward": 2.3054428100585938,
"reward_std": 0.166117824614048,
"rewards/accuracy_reward": 1.313255250453949,
"rewards/format_reward": 0.9921875,
"step": 127
},
{
"completion_length": 77.3046875,
"epoch": 1.5421686746987953,
"grad_norm": 4.318669163836461,
"kl": 0.091796875,
"learning_rate": 7.429718875502008e-07,
"loss": 0.0037,
"reward": 2.1308990716934204,
"reward_std": 0.19852972030639648,
"rewards/accuracy_reward": 1.13089919090271,
"rewards/format_reward": 1.0,
"step": 128
},
{
"completion_length": 78.1015625,
"epoch": 1.5542168674698795,
"grad_norm": 4.096032296356097,
"kl": 0.102783203125,
"learning_rate": 7.409638554216867e-07,
"loss": 0.0041,
"reward": 2.445680260658264,
"reward_std": 0.1704091727733612,
"rewards/accuracy_reward": 1.4456802010536194,
"rewards/format_reward": 1.0,
"step": 129
},
{
"completion_length": 74.75,
"epoch": 1.5662650602409638,
"grad_norm": 4.47404453525868,
"kl": 0.100341796875,
"learning_rate": 7.389558232931726e-07,
"loss": 0.004,
"reward": 2.2448705434799194,
"reward_std": 0.21340852975845337,
"rewards/accuracy_reward": 1.2448704838752747,
"rewards/format_reward": 1.0,
"step": 130
},
{
"completion_length": 75.3671875,
"epoch": 1.5783132530120483,
"grad_norm": 23.135090346261265,
"kl": 1.1025390625,
"learning_rate": 7.369477911646586e-07,
"loss": 0.0444,
"reward": 2.368005871772766,
"reward_std": 0.24276328086853027,
"rewards/accuracy_reward": 1.3680058717727661,
"rewards/format_reward": 1.0,
"step": 131
},
{
"completion_length": 76.5234375,
"epoch": 1.5903614457831325,
"grad_norm": 3.560296625305877,
"kl": 0.14111328125,
"learning_rate": 7.349397590361446e-07,
"loss": 0.0056,
"reward": 2.3832234144210815,
"reward_std": 0.2271246314048767,
"rewards/accuracy_reward": 1.398848533630371,
"rewards/format_reward": 0.984375,
"step": 132
},
{
"completion_length": 78.515625,
"epoch": 1.6024096385542168,
"grad_norm": 4.271885997013165,
"kl": 0.103271484375,
"learning_rate": 7.329317269076305e-07,
"loss": 0.0041,
"reward": 2.11967396736145,
"reward_std": 0.21069814264774323,
"rewards/accuracy_reward": 1.119674026966095,
"rewards/format_reward": 1.0,
"step": 133
},
{
"completion_length": 81.2109375,
"epoch": 1.6144578313253013,
"grad_norm": 3.989749340172797,
"kl": 0.10009765625,
"learning_rate": 7.309236947791164e-07,
"loss": 0.004,
"reward": 2.2381746768951416,
"reward_std": 0.2712934762239456,
"rewards/accuracy_reward": 1.2537997961044312,
"rewards/format_reward": 0.984375,
"step": 134
},
{
"completion_length": 84.828125,
"epoch": 1.6265060240963856,
"grad_norm": 5.101727030105181,
"kl": 0.0927734375,
"learning_rate": 7.289156626506024e-07,
"loss": 0.0037,
"reward": 2.3006190061569214,
"reward_std": 0.2388201355934143,
"rewards/accuracy_reward": 1.3084314465522766,
"rewards/format_reward": 0.9921875,
"step": 135
},
{
"completion_length": 78.3984375,
"epoch": 1.6385542168674698,
"grad_norm": 7.945369222479043,
"kl": 0.109130859375,
"learning_rate": 7.269076305220884e-07,
"loss": 0.0044,
"reward": 2.187756061553955,
"reward_std": 0.22536994516849518,
"rewards/accuracy_reward": 1.2033808827400208,
"rewards/format_reward": 0.984375,
"step": 136
},
{
"completion_length": 83.0234375,
"epoch": 1.6506024096385543,
"grad_norm": 7.511759922163927,
"kl": 0.074462890625,
"learning_rate": 7.248995983935742e-07,
"loss": 0.003,
"reward": 2.299572706222534,
"reward_std": 0.22408785670995712,
"rewards/accuracy_reward": 1.3073852062225342,
"rewards/format_reward": 0.9921875,
"step": 137
},
{
"completion_length": 84.640625,
"epoch": 1.6626506024096386,
"grad_norm": 3.2982396535282623,
"kl": 0.0810546875,
"learning_rate": 7.228915662650602e-07,
"loss": 0.0032,
"reward": 2.3804391622543335,
"reward_std": 0.2060808688402176,
"rewards/accuracy_reward": 1.3804389834403992,
"rewards/format_reward": 1.0,
"step": 138
},
{
"completion_length": 87.8125,
"epoch": 1.6746987951807228,
"grad_norm": 8.41708008218346,
"kl": 0.0810546875,
"learning_rate": 7.208835341365461e-07,
"loss": 0.0032,
"reward": 2.2146860361099243,
"reward_std": 0.2540859431028366,
"rewards/accuracy_reward": 1.2146860361099243,
"rewards/format_reward": 1.0,
"step": 139
},
{
"completion_length": 86.140625,
"epoch": 1.6867469879518073,
"grad_norm": 3.5435273544538815,
"kl": 0.072998046875,
"learning_rate": 7.188755020080321e-07,
"loss": 0.0029,
"reward": 2.3307693004608154,
"reward_std": 0.20385809987783432,
"rewards/accuracy_reward": 1.3385818004608154,
"rewards/format_reward": 0.9921875,
"step": 140
},
{
"completion_length": 85.9375,
"epoch": 1.6987951807228916,
"grad_norm": 3.544683408089574,
"kl": 0.083984375,
"learning_rate": 7.168674698795181e-07,
"loss": 0.0034,
"reward": 2.2913438081741333,
"reward_std": 0.26863446831703186,
"rewards/accuracy_reward": 1.3069688081741333,
"rewards/format_reward": 0.984375,
"step": 141
},
{
"completion_length": 83.2578125,
"epoch": 1.7108433734939759,
"grad_norm": 4.741927242341381,
"kl": 0.12548828125,
"learning_rate": 7.14859437751004e-07,
"loss": 0.005,
"reward": 2.3960628509521484,
"reward_std": 0.2550785541534424,
"rewards/accuracy_reward": 1.3960627913475037,
"rewards/format_reward": 1.0,
"step": 142
},
{
"completion_length": 86.671875,
"epoch": 1.7228915662650603,
"grad_norm": 3.0874349711182494,
"kl": 0.07470703125,
"learning_rate": 7.128514056224899e-07,
"loss": 0.003,
"reward": 2.3813560009002686,
"reward_std": 0.25298502296209335,
"rewards/accuracy_reward": 1.381356120109558,
"rewards/format_reward": 1.0,
"step": 143
},
{
"completion_length": 80.40625,
"epoch": 1.7349397590361446,
"grad_norm": 9.215211678123678,
"kl": 0.085693359375,
"learning_rate": 7.108433734939758e-07,
"loss": 0.0034,
"reward": 2.3150322437286377,
"reward_std": 0.23231424391269684,
"rewards/accuracy_reward": 1.315032422542572,
"rewards/format_reward": 1.0,
"step": 144
},
{
"completion_length": 79.5859375,
"epoch": 1.7469879518072289,
"grad_norm": 3.3677362414264307,
"kl": 0.098876953125,
"learning_rate": 7.088353413654619e-07,
"loss": 0.0039,
"reward": 2.2901567220687866,
"reward_std": 0.21487458050251007,
"rewards/accuracy_reward": 1.2979693412780762,
"rewards/format_reward": 0.9921875,
"step": 145
},
{
"completion_length": 87.2734375,
"epoch": 1.7590361445783134,
"grad_norm": 3.8053306313986037,
"kl": 0.104736328125,
"learning_rate": 7.068273092369477e-07,
"loss": 0.0042,
"reward": 2.2074761390686035,
"reward_std": 0.24223129451274872,
"rewards/accuracy_reward": 1.2074760794639587,
"rewards/format_reward": 1.0,
"step": 146
},
{
"completion_length": 88.984375,
"epoch": 1.7710843373493976,
"grad_norm": 4.960937467624004,
"kl": 0.08251953125,
"learning_rate": 7.048192771084337e-07,
"loss": 0.0033,
"reward": 2.2357683181762695,
"reward_std": 0.2608248367905617,
"rewards/accuracy_reward": 1.2435806393623352,
"rewards/format_reward": 0.9921875,
"step": 147
},
{
"completion_length": 80.421875,
"epoch": 1.783132530120482,
"grad_norm": 3.5313461555382717,
"kl": 0.106689453125,
"learning_rate": 7.028112449799196e-07,
"loss": 0.0042,
"reward": 2.223365068435669,
"reward_std": 0.20793087780475616,
"rewards/accuracy_reward": 1.2311774492263794,
"rewards/format_reward": 0.9921875,
"step": 148
},
{
"completion_length": 81.6328125,
"epoch": 1.7951807228915664,
"grad_norm": 3.917968857756188,
"kl": 0.082763671875,
"learning_rate": 7.008032128514057e-07,
"loss": 0.0033,
"reward": 2.431049346923828,
"reward_std": 0.25210463255643845,
"rewards/accuracy_reward": 1.4310495257377625,
"rewards/format_reward": 1.0,
"step": 149
},
{
"completion_length": 82.71875,
"epoch": 1.8072289156626506,
"grad_norm": 3.2751640437820417,
"kl": 0.105224609375,
"learning_rate": 6.987951807228916e-07,
"loss": 0.0042,
"reward": 2.167607069015503,
"reward_std": 0.20023201406002045,
"rewards/accuracy_reward": 1.183232069015503,
"rewards/format_reward": 0.984375,
"step": 150
},
{
"completion_length": 80.1015625,
"epoch": 1.819277108433735,
"grad_norm": 3.696030829693263,
"kl": 0.09716796875,
"learning_rate": 6.967871485943774e-07,
"loss": 0.0039,
"reward": 2.545083999633789,
"reward_std": 0.17634352296590805,
"rewards/accuracy_reward": 1.5450841188430786,
"rewards/format_reward": 1.0,
"step": 151
},
{
"completion_length": 81.6484375,
"epoch": 1.8313253012048194,
"grad_norm": 5.419229696650584,
"kl": 0.119873046875,
"learning_rate": 6.947791164658634e-07,
"loss": 0.0048,
"reward": 2.144273281097412,
"reward_std": 0.2491978257894516,
"rewards/accuracy_reward": 1.152085781097412,
"rewards/format_reward": 0.9921875,
"step": 152
},
{
"completion_length": 77.96875,
"epoch": 1.8433734939759037,
"grad_norm": 34.81233821704641,
"kl": 0.09619140625,
"learning_rate": 6.927710843373493e-07,
"loss": 0.0039,
"reward": 2.4207249879837036,
"reward_std": 0.22066732123494148,
"rewards/accuracy_reward": 1.4207251071929932,
"rewards/format_reward": 1.0,
"step": 153
},
{
"completion_length": 81.3984375,
"epoch": 1.855421686746988,
"grad_norm": 4.095705367504911,
"kl": 0.101806640625,
"learning_rate": 6.907630522088354e-07,
"loss": 0.0041,
"reward": 2.160383105278015,
"reward_std": 0.27165083587169647,
"rewards/accuracy_reward": 1.1681956052780151,
"rewards/format_reward": 0.9921875,
"step": 154
},
{
"completion_length": 79.78125,
"epoch": 1.8674698795180724,
"grad_norm": 3.0440685644807663,
"kl": 0.11865234375,
"learning_rate": 6.887550200803212e-07,
"loss": 0.0047,
"reward": 2.4971319437026978,
"reward_std": 0.16808781027793884,
"rewards/accuracy_reward": 1.4971320629119873,
"rewards/format_reward": 1.0,
"step": 155
},
{
"completion_length": 83.09375,
"epoch": 1.8795180722891565,
"grad_norm": 3.1771226883841206,
"kl": 0.10498046875,
"learning_rate": 6.867469879518072e-07,
"loss": 0.0042,
"reward": 2.1450811624526978,
"reward_std": 0.2694619745016098,
"rewards/accuracy_reward": 1.1450812816619873,
"rewards/format_reward": 1.0,
"step": 156
},
{
"completion_length": 81.9453125,
"epoch": 1.891566265060241,
"grad_norm": 3.4230588560037583,
"kl": 0.113525390625,
"learning_rate": 6.847389558232931e-07,
"loss": 0.0045,
"reward": 2.44959032535553,
"reward_std": 0.16196198761463165,
"rewards/accuracy_reward": 1.4574028253555298,
"rewards/format_reward": 0.9921875,
"step": 157
},
{
"completion_length": 86.203125,
"epoch": 1.9036144578313254,
"grad_norm": 5.9344079114737,
"kl": 0.1015625,
"learning_rate": 6.827309236947792e-07,
"loss": 0.0041,
"reward": 2.1924350261688232,
"reward_std": 0.1869198903441429,
"rewards/accuracy_reward": 1.1924351453781128,
"rewards/format_reward": 1.0,
"step": 158
},
{
"completion_length": 84.7734375,
"epoch": 1.9156626506024095,
"grad_norm": 3.7338258911048707,
"kl": 0.105224609375,
"learning_rate": 6.807228915662651e-07,
"loss": 0.0042,
"reward": 2.298088550567627,
"reward_std": 0.2152806669473648,
"rewards/accuracy_reward": 1.3059011697769165,
"rewards/format_reward": 0.9921875,
"step": 159
},
{
"completion_length": 88.2109375,
"epoch": 1.927710843373494,
"grad_norm": 3.2737012532681535,
"kl": 0.124755859375,
"learning_rate": 6.787148594377509e-07,
"loss": 0.005,
"reward": 2.3695740699768066,
"reward_std": 0.300421878695488,
"rewards/accuracy_reward": 1.3930113911628723,
"rewards/format_reward": 0.9765625,
"step": 160
},
{
"completion_length": 82.9921875,
"epoch": 1.9397590361445785,
"grad_norm": 14.347253854862437,
"kl": 0.119873046875,
"learning_rate": 6.767068273092369e-07,
"loss": 0.0048,
"reward": 2.306626796722412,
"reward_std": 0.2548489645123482,
"rewards/accuracy_reward": 1.3222516179084778,
"rewards/format_reward": 0.984375,
"step": 161
},
{
"completion_length": 87.734375,
"epoch": 1.9518072289156625,
"grad_norm": 3.457686333163172,
"kl": 0.109375,
"learning_rate": 6.746987951807228e-07,
"loss": 0.0044,
"reward": 2.2328758239746094,
"reward_std": 0.28791245073080063,
"rewards/accuracy_reward": 1.2641257643699646,
"rewards/format_reward": 0.96875,
"step": 162
},
{
"completion_length": 83.25,
"epoch": 1.963855421686747,
"grad_norm": 4.1768305143971824,
"kl": 0.12353515625,
"learning_rate": 6.726907630522089e-07,
"loss": 0.0049,
"reward": 2.2161502838134766,
"reward_std": 0.25863420963287354,
"rewards/accuracy_reward": 1.2630252242088318,
"rewards/format_reward": 0.953125,
"step": 163
},
{
"completion_length": 88.734375,
"epoch": 1.9759036144578315,
"grad_norm": 4.842793088552531,
"kl": 0.105712890625,
"learning_rate": 6.706827309236947e-07,
"loss": 0.0042,
"reward": 2.090719521045685,
"reward_std": 0.25029148161411285,
"rewards/accuracy_reward": 1.1141569316387177,
"rewards/format_reward": 0.9765625,
"step": 164
},
{
"completion_length": 86.1953125,
"epoch": 1.9879518072289155,
"grad_norm": 3.657481472750154,
"kl": 0.125244140625,
"learning_rate": 6.686746987951807e-07,
"loss": 0.005,
"reward": 2.2765581607818604,
"reward_std": 0.2915503680706024,
"rewards/accuracy_reward": 1.30780827999115,
"rewards/format_reward": 0.96875,
"step": 165
},
{
"completion_length": 92.16666793823242,
"epoch": 2.0,
"grad_norm": 3.6057161188599776,
"kl": 0.125732421875,
"learning_rate": 6.666666666666666e-07,
"loss": 0.0047,
"reward": 2.234604835510254,
"reward_std": 0.2570358142256737,
"rewards/accuracy_reward": 1.2346049845218658,
"rewards/format_reward": 1.0,
"step": 166
},
{
"completion_length": 87.1484375,
"epoch": 2.0120481927710845,
"grad_norm": 3.7603470456590564,
"kl": 0.094482421875,
"learning_rate": 6.646586345381526e-07,
"loss": 0.0038,
"reward": 2.2034374475479126,
"reward_std": 0.3387380540370941,
"rewards/accuracy_reward": 1.2112498879432678,
"rewards/format_reward": 0.9921875,
"step": 167
},
{
"completion_length": 86.1953125,
"epoch": 2.0240963855421685,
"grad_norm": 4.4381952945033465,
"kl": 0.09765625,
"learning_rate": 6.626506024096386e-07,
"loss": 0.0039,
"reward": 2.222957730293274,
"reward_std": 0.2284381240606308,
"rewards/accuracy_reward": 1.238582730293274,
"rewards/format_reward": 0.984375,
"step": 168
},
{
"completion_length": 84.3125,
"epoch": 2.036144578313253,
"grad_norm": 3.399081917667578,
"kl": 0.0966796875,
"learning_rate": 6.606425702811244e-07,
"loss": 0.0039,
"reward": 2.2074966430664062,
"reward_std": 0.2783028930425644,
"rewards/accuracy_reward": 1.2231215238571167,
"rewards/format_reward": 0.984375,
"step": 169
},
{
"completion_length": 84.1640625,
"epoch": 2.0481927710843375,
"grad_norm": 3.794821230336393,
"kl": 0.10400390625,
"learning_rate": 6.586345381526104e-07,
"loss": 0.0042,
"reward": 2.2774429321289062,
"reward_std": 0.18755661696195602,
"rewards/accuracy_reward": 1.2774428129196167,
"rewards/format_reward": 1.0,
"step": 170
},
{
"completion_length": 84.7421875,
"epoch": 2.0602409638554215,
"grad_norm": 5.41653478361753,
"kl": 0.09130859375,
"learning_rate": 6.566265060240963e-07,
"loss": 0.0036,
"reward": 2.2825827598571777,
"reward_std": 0.20142250508069992,
"rewards/accuracy_reward": 1.2825825810432434,
"rewards/format_reward": 1.0,
"step": 171
},
{
"completion_length": 78.421875,
"epoch": 2.072289156626506,
"grad_norm": 4.831319526617051,
"kl": 0.099365234375,
"learning_rate": 6.546184738955824e-07,
"loss": 0.004,
"reward": 2.4247552156448364,
"reward_std": 0.19953592866659164,
"rewards/accuracy_reward": 1.4247552752494812,
"rewards/format_reward": 1.0,
"step": 172
},
{
"completion_length": 78.359375,
"epoch": 2.0843373493975905,
"grad_norm": 3.8109915515963038,
"kl": 0.10498046875,
"learning_rate": 6.526104417670682e-07,
"loss": 0.0042,
"reward": 2.3325507640838623,
"reward_std": 0.26026056706905365,
"rewards/accuracy_reward": 1.348175823688507,
"rewards/format_reward": 0.984375,
"step": 173
},
{
"completion_length": 79.21875,
"epoch": 2.0963855421686746,
"grad_norm": 4.94758596751216,
"kl": 0.130615234375,
"learning_rate": 6.506024096385541e-07,
"loss": 0.0052,
"reward": 2.3614529371261597,
"reward_std": 0.23941361159086227,
"rewards/accuracy_reward": 1.3614528179168701,
"rewards/format_reward": 1.0,
"step": 174
},
{
"completion_length": 80.8984375,
"epoch": 2.108433734939759,
"grad_norm": 4.645980861130919,
"kl": 0.12646484375,
"learning_rate": 6.485943775100401e-07,
"loss": 0.0051,
"reward": 2.148719310760498,
"reward_std": 0.2538711354136467,
"rewards/accuracy_reward": 1.1487191915512085,
"rewards/format_reward": 1.0,
"step": 175
},
{
"completion_length": 78.921875,
"epoch": 2.1204819277108435,
"grad_norm": 3.362542245290514,
"kl": 0.090576171875,
"learning_rate": 6.465863453815261e-07,
"loss": 0.0036,
"reward": 2.3466458320617676,
"reward_std": 0.21008533239364624,
"rewards/accuracy_reward": 1.346645712852478,
"rewards/format_reward": 1.0,
"step": 176
},
{
"completion_length": 78.5546875,
"epoch": 2.1325301204819276,
"grad_norm": 3.6960106974538585,
"kl": 0.0908203125,
"learning_rate": 6.445783132530121e-07,
"loss": 0.0036,
"reward": 2.4223729372024536,
"reward_std": 0.15239863470196724,
"rewards/accuracy_reward": 1.4223730564117432,
"rewards/format_reward": 1.0,
"step": 177
},
{
"completion_length": 76.890625,
"epoch": 2.144578313253012,
"grad_norm": 3.5646239400027913,
"kl": 0.103515625,
"learning_rate": 6.425702811244979e-07,
"loss": 0.0041,
"reward": 2.4388126134872437,
"reward_std": 0.22842204570770264,
"rewards/accuracy_reward": 1.4466250538825989,
"rewards/format_reward": 0.9921875,
"step": 178
},
{
"completion_length": 78.796875,
"epoch": 2.1566265060240966,
"grad_norm": 3.531186908359453,
"kl": 0.099609375,
"learning_rate": 6.405622489959839e-07,
"loss": 0.004,
"reward": 2.1039586067199707,
"reward_std": 0.23404612392187119,
"rewards/accuracy_reward": 1.1273961663246155,
"rewards/format_reward": 0.9765625,
"step": 179
},
{
"completion_length": 75.75,
"epoch": 2.1686746987951806,
"grad_norm": 5.0096541073452485,
"kl": 0.1015625,
"learning_rate": 6.385542168674698e-07,
"loss": 0.0041,
"reward": 2.374882221221924,
"reward_std": 0.2003496214747429,
"rewards/accuracy_reward": 1.374882161617279,
"rewards/format_reward": 1.0,
"step": 180
},
{
"completion_length": 79.9375,
"epoch": 2.180722891566265,
"grad_norm": 3.929802835585037,
"kl": 0.102294921875,
"learning_rate": 6.365461847389559e-07,
"loss": 0.0041,
"reward": 2.4310786724090576,
"reward_std": 0.20660096406936646,
"rewards/accuracy_reward": 1.4310787916183472,
"rewards/format_reward": 1.0,
"step": 181
},
{
"completion_length": 80.7578125,
"epoch": 2.1927710843373496,
"grad_norm": 4.226674931816659,
"kl": 0.09619140625,
"learning_rate": 6.345381526104418e-07,
"loss": 0.0038,
"reward": 2.3952780961990356,
"reward_std": 0.2160111963748932,
"rewards/accuracy_reward": 1.3952780961990356,
"rewards/format_reward": 1.0,
"step": 182
},
{
"completion_length": 80.484375,
"epoch": 2.2048192771084336,
"grad_norm": 3.463553859166022,
"kl": 0.107421875,
"learning_rate": 6.325301204819276e-07,
"loss": 0.0043,
"reward": 2.3913345336914062,
"reward_std": 0.22311442345380783,
"rewards/accuracy_reward": 1.3991470336914062,
"rewards/format_reward": 0.9921875,
"step": 183
},
{
"completion_length": 78.484375,
"epoch": 2.216867469879518,
"grad_norm": 3.9553841913647356,
"kl": 0.08642578125,
"learning_rate": 6.305220883534136e-07,
"loss": 0.0035,
"reward": 2.353707432746887,
"reward_std": 0.2809625118970871,
"rewards/accuracy_reward": 1.3615199327468872,
"rewards/format_reward": 0.9921875,
"step": 184
},
{
"completion_length": 86.203125,
"epoch": 2.2289156626506026,
"grad_norm": 6.103835532514207,
"kl": 0.075439453125,
"learning_rate": 6.285140562248996e-07,
"loss": 0.003,
"reward": 2.411812663078308,
"reward_std": 0.17931858450174332,
"rewards/accuracy_reward": 1.411812663078308,
"rewards/format_reward": 1.0,
"step": 185
},
{
"completion_length": 77.515625,
"epoch": 2.2409638554216866,
"grad_norm": 3.91857543195832,
"kl": 0.10107421875,
"learning_rate": 6.265060240963856e-07,
"loss": 0.004,
"reward": 2.2299575805664062,
"reward_std": 0.2100789025425911,
"rewards/accuracy_reward": 1.2377700209617615,
"rewards/format_reward": 0.9921875,
"step": 186
},
{
"completion_length": 77.09375,
"epoch": 2.253012048192771,
"grad_norm": 3.8592654709883796,
"kl": 0.095947265625,
"learning_rate": 6.244979919678714e-07,
"loss": 0.0038,
"reward": 2.47510826587677,
"reward_std": 0.2556135207414627,
"rewards/accuracy_reward": 1.4829206466674805,
"rewards/format_reward": 0.9921875,
"step": 187
},
{
"completion_length": 79.2890625,
"epoch": 2.2650602409638556,
"grad_norm": 6.921774157099546,
"kl": 0.093017578125,
"learning_rate": 6.224899598393574e-07,
"loss": 0.0037,
"reward": 2.3394941091537476,
"reward_std": 0.23163118958473206,
"rewards/accuracy_reward": 1.3394939303398132,
"rewards/format_reward": 1.0,
"step": 188
},
{
"completion_length": 79.546875,
"epoch": 2.2771084337349397,
"grad_norm": 5.699992937395376,
"kl": 0.08544921875,
"learning_rate": 6.204819277108434e-07,
"loss": 0.0034,
"reward": 2.330021381378174,
"reward_std": 0.21045994758605957,
"rewards/accuracy_reward": 1.3300212621688843,
"rewards/format_reward": 1.0,
"step": 189
},
{
"completion_length": 77.421875,
"epoch": 2.289156626506024,
"grad_norm": 4.425700742489554,
"kl": 0.098388671875,
"learning_rate": 6.184738955823293e-07,
"loss": 0.0039,
"reward": 2.2294440269470215,
"reward_std": 0.21671444922685623,
"rewards/accuracy_reward": 1.2294440865516663,
"rewards/format_reward": 1.0,
"step": 190
},
{
"completion_length": 74.6640625,
"epoch": 2.3012048192771086,
"grad_norm": 3.5141288907091783,
"kl": 0.08154296875,
"learning_rate": 6.164658634538153e-07,
"loss": 0.0033,
"reward": 2.417364239692688,
"reward_std": 0.18784678727388382,
"rewards/accuracy_reward": 1.4173641800880432,
"rewards/format_reward": 1.0,
"step": 191
},
{
"completion_length": 74.53125,
"epoch": 2.3132530120481927,
"grad_norm": 4.6610918738389095,
"kl": 0.096435546875,
"learning_rate": 6.144578313253011e-07,
"loss": 0.0039,
"reward": 2.4048426151275635,
"reward_std": 0.2764005810022354,
"rewards/accuracy_reward": 1.412655234336853,
"rewards/format_reward": 0.9921875,
"step": 192
},
{
"completion_length": 80.8984375,
"epoch": 2.325301204819277,
"grad_norm": 6.933183617809393,
"kl": 0.07861328125,
"learning_rate": 6.124497991967871e-07,
"loss": 0.0031,
"reward": 2.2180745601654053,
"reward_std": 0.2127843052148819,
"rewards/accuracy_reward": 1.21807461977005,
"rewards/format_reward": 1.0,
"step": 193
},
{
"completion_length": 80.9296875,
"epoch": 2.337349397590361,
"grad_norm": 4.526116466506062,
"kl": 0.088623046875,
"learning_rate": 6.104417670682731e-07,
"loss": 0.0035,
"reward": 2.2327487468719482,
"reward_std": 0.2369586005806923,
"rewards/accuracy_reward": 1.240561306476593,
"rewards/format_reward": 0.9921875,
"step": 194
},
{
"completion_length": 79.8359375,
"epoch": 2.3493975903614457,
"grad_norm": 3.410370565415923,
"kl": 0.09326171875,
"learning_rate": 6.084337349397591e-07,
"loss": 0.0037,
"reward": 2.222264051437378,
"reward_std": 0.26303592324256897,
"rewards/accuracy_reward": 1.230076551437378,
"rewards/format_reward": 0.9921875,
"step": 195
},
{
"completion_length": 73.8828125,
"epoch": 2.36144578313253,
"grad_norm": 3.962197046428477,
"kl": 0.103271484375,
"learning_rate": 6.064257028112449e-07,
"loss": 0.0041,
"reward": 2.296523690223694,
"reward_std": 0.370675727725029,
"rewards/accuracy_reward": 1.2965235710144043,
"rewards/format_reward": 1.0,
"step": 196
},
{
"completion_length": 74.515625,
"epoch": 2.3734939759036147,
"grad_norm": 3.7849181083166066,
"kl": 0.100341796875,
"learning_rate": 6.044176706827308e-07,
"loss": 0.004,
"reward": 2.1898573637008667,
"reward_std": 0.2903239354491234,
"rewards/accuracy_reward": 1.1898574829101562,
"rewards/format_reward": 1.0,
"step": 197
},
{
"completion_length": 71.015625,
"epoch": 2.3855421686746987,
"grad_norm": 4.598411590922377,
"kl": 0.09716796875,
"learning_rate": 6.024096385542169e-07,
"loss": 0.0039,
"reward": 2.3405251502990723,
"reward_std": 0.1668776124715805,
"rewards/accuracy_reward": 1.3405250310897827,
"rewards/format_reward": 1.0,
"step": 198
},
{
"completion_length": 72.0234375,
"epoch": 2.397590361445783,
"grad_norm": 4.094960420612339,
"kl": 0.08447265625,
"learning_rate": 6.004016064257028e-07,
"loss": 0.0034,
"reward": 2.2692129611968994,
"reward_std": 0.22979120910167694,
"rewards/accuracy_reward": 1.2848379015922546,
"rewards/format_reward": 0.984375,
"step": 199
},
{
"completion_length": 76.34375,
"epoch": 2.4096385542168672,
"grad_norm": 5.228591551586785,
"kl": 0.0771484375,
"learning_rate": 5.983935742971888e-07,
"loss": 0.0031,
"reward": 2.29106342792511,
"reward_std": 0.22756240516901016,
"rewards/accuracy_reward": 1.2910634279251099,
"rewards/format_reward": 1.0,
"step": 200
},
{
"completion_length": 79.3828125,
"epoch": 2.4216867469879517,
"grad_norm": 3.532651567007306,
"kl": 0.140869140625,
"learning_rate": 5.963855421686746e-07,
"loss": 0.0056,
"reward": 2.218053698539734,
"reward_std": 0.24822543561458588,
"rewards/accuracy_reward": 1.2180536985397339,
"rewards/format_reward": 1.0,
"step": 201
},
{
"completion_length": 76.0,
"epoch": 2.433734939759036,
"grad_norm": 3.316768093202225,
"kl": 0.088134765625,
"learning_rate": 5.943775100401606e-07,
"loss": 0.0035,
"reward": 2.26613187789917,
"reward_std": 0.24750088155269623,
"rewards/accuracy_reward": 1.2739443182945251,
"rewards/format_reward": 0.9921875,
"step": 202
},
{
"completion_length": 70.5234375,
"epoch": 2.4457831325301207,
"grad_norm": 9.031966519770473,
"kl": 0.099853515625,
"learning_rate": 5.923694779116466e-07,
"loss": 0.004,
"reward": 2.317081928253174,
"reward_std": 0.24299181252717972,
"rewards/accuracy_reward": 1.3248944282531738,
"rewards/format_reward": 0.9921875,
"step": 203
},
{
"completion_length": 72.1484375,
"epoch": 2.4578313253012047,
"grad_norm": 4.923799185057533,
"kl": 0.09716796875,
"learning_rate": 5.903614457831325e-07,
"loss": 0.0039,
"reward": 2.202351689338684,
"reward_std": 0.24287213385105133,
"rewards/accuracy_reward": 1.2023517489433289,
"rewards/format_reward": 1.0,
"step": 204
},
{
"completion_length": 75.5390625,
"epoch": 2.4698795180722892,
"grad_norm": 10.424209527328602,
"kl": 0.0849609375,
"learning_rate": 5.883534136546184e-07,
"loss": 0.0034,
"reward": 2.3431246280670166,
"reward_std": 0.21441341936588287,
"rewards/accuracy_reward": 1.3431245684623718,
"rewards/format_reward": 1.0,
"step": 205
},
{
"completion_length": 74.1328125,
"epoch": 2.4819277108433733,
"grad_norm": 5.39794558294026,
"kl": 0.08349609375,
"learning_rate": 5.863453815261043e-07,
"loss": 0.0033,
"reward": 2.318004846572876,
"reward_std": 0.1649407297372818,
"rewards/accuracy_reward": 1.3180049657821655,
"rewards/format_reward": 1.0,
"step": 206
},
{
"completion_length": 70.828125,
"epoch": 2.4939759036144578,
"grad_norm": 5.651509118393077,
"kl": 0.099609375,
"learning_rate": 5.843373493975904e-07,
"loss": 0.004,
"reward": 2.2745083570480347,
"reward_std": 0.1795399785041809,
"rewards/accuracy_reward": 1.27450829744339,
"rewards/format_reward": 1.0,
"step": 207
},
{
"completion_length": 75.1484375,
"epoch": 2.5060240963855422,
"grad_norm": 3.374258945078158,
"kl": 0.099853515625,
"learning_rate": 5.823293172690763e-07,
"loss": 0.004,
"reward": 2.183190941810608,
"reward_std": 0.19665208458900452,
"rewards/accuracy_reward": 1.183190941810608,
"rewards/format_reward": 1.0,
"step": 208
},
{
"completion_length": 75.15625,
"epoch": 2.5180722891566267,
"grad_norm": 3.680961209255419,
"kl": 0.085693359375,
"learning_rate": 5.803212851405623e-07,
"loss": 0.0034,
"reward": 2.3783202171325684,
"reward_std": 0.21517369151115417,
"rewards/accuracy_reward": 1.3861328959465027,
"rewards/format_reward": 0.9921875,
"step": 209
},
{
"completion_length": 75.890625,
"epoch": 2.5301204819277108,
"grad_norm": 4.203577590596214,
"kl": 0.093017578125,
"learning_rate": 5.783132530120481e-07,
"loss": 0.0037,
"reward": 2.232303738594055,
"reward_std": 0.21822457760572433,
"rewards/accuracy_reward": 1.2401162385940552,
"rewards/format_reward": 0.9921875,
"step": 210
},
{
"completion_length": 72.5234375,
"epoch": 2.5421686746987953,
"grad_norm": 5.049709537985753,
"kl": 0.09033203125,
"learning_rate": 5.76305220883534e-07,
"loss": 0.0036,
"reward": 2.3138071298599243,
"reward_std": 0.18903522193431854,
"rewards/accuracy_reward": 1.3138071298599243,
"rewards/format_reward": 1.0,
"step": 211
},
{
"completion_length": 77.6796875,
"epoch": 2.5542168674698793,
"grad_norm": 4.79270453347689,
"kl": 0.10791015625,
"learning_rate": 5.742971887550201e-07,
"loss": 0.0043,
"reward": 2.35454523563385,
"reward_std": 0.260717436671257,
"rewards/accuracy_reward": 1.36235773563385,
"rewards/format_reward": 0.9921875,
"step": 212
},
{
"completion_length": 75.5234375,
"epoch": 2.566265060240964,
"grad_norm": 3.8110594359613694,
"kl": 0.132080078125,
"learning_rate": 5.72289156626506e-07,
"loss": 0.0053,
"reward": 2.3396618366241455,
"reward_std": 0.2776957154273987,
"rewards/accuracy_reward": 1.3474743366241455,
"rewards/format_reward": 0.9921875,
"step": 213
},
{
"completion_length": 78.8203125,
"epoch": 2.5783132530120483,
"grad_norm": 3.5277793226603467,
"kl": 0.082763671875,
"learning_rate": 5.70281124497992e-07,
"loss": 0.0033,
"reward": 2.282657027244568,
"reward_std": 0.20082392543554306,
"rewards/accuracy_reward": 1.2826570868492126,
"rewards/format_reward": 1.0,
"step": 214
},
{
"completion_length": 79.7265625,
"epoch": 2.5903614457831328,
"grad_norm": 5.661825173466666,
"kl": 0.070068359375,
"learning_rate": 5.682730923694778e-07,
"loss": 0.0028,
"reward": 2.2916386127471924,
"reward_std": 0.22843700647354126,
"rewards/accuracy_reward": 1.2916386723518372,
"rewards/format_reward": 1.0,
"step": 215
},
{
"completion_length": 75.484375,
"epoch": 2.602409638554217,
"grad_norm": 5.408656767411551,
"kl": 0.074951171875,
"learning_rate": 5.662650602409639e-07,
"loss": 0.003,
"reward": 2.4862678050994873,
"reward_std": 0.17430586367845535,
"rewards/accuracy_reward": 1.4862679243087769,
"rewards/format_reward": 1.0,
"step": 216
},
{
"completion_length": 75.4140625,
"epoch": 2.6144578313253013,
"grad_norm": 4.437169209890788,
"kl": 0.1123046875,
"learning_rate": 5.642570281124498e-07,
"loss": 0.0045,
"reward": 2.2881970405578613,
"reward_std": 0.24159938842058182,
"rewards/accuracy_reward": 1.3116344809532166,
"rewards/format_reward": 0.9765625,
"step": 217
},
{
"completion_length": 77.1484375,
"epoch": 2.6265060240963853,
"grad_norm": 3.7017405154535608,
"kl": 0.0849609375,
"learning_rate": 5.622489959839358e-07,
"loss": 0.0034,
"reward": 2.42057728767395,
"reward_std": 0.1918034851551056,
"rewards/accuracy_reward": 1.4205771684646606,
"rewards/format_reward": 1.0,
"step": 218
},
{
"completion_length": 74.9921875,
"epoch": 2.63855421686747,
"grad_norm": 3.0572748613034184,
"kl": 0.08056640625,
"learning_rate": 5.602409638554216e-07,
"loss": 0.0032,
"reward": 2.296902298927307,
"reward_std": 0.22776726633310318,
"rewards/accuracy_reward": 1.2969022989273071,
"rewards/format_reward": 1.0,
"step": 219
},
{
"completion_length": 77.9375,
"epoch": 2.6506024096385543,
"grad_norm": 5.142063259050984,
"kl": 0.08251953125,
"learning_rate": 5.582329317269075e-07,
"loss": 0.0033,
"reward": 2.411815643310547,
"reward_std": 0.20656804740428925,
"rewards/accuracy_reward": 1.4118155241012573,
"rewards/format_reward": 1.0,
"step": 220
},
{
"completion_length": 75.0625,
"epoch": 2.662650602409639,
"grad_norm": 9.244315362233946,
"kl": 0.094482421875,
"learning_rate": 5.562248995983936e-07,
"loss": 0.0038,
"reward": 2.2525359392166138,
"reward_std": 0.23683273047208786,
"rewards/accuracy_reward": 1.2681609392166138,
"rewards/format_reward": 0.984375,
"step": 221
},
{
"completion_length": 78.390625,
"epoch": 2.674698795180723,
"grad_norm": 4.89406748105177,
"kl": 0.078125,
"learning_rate": 5.542168674698795e-07,
"loss": 0.0031,
"reward": 2.33753764629364,
"reward_std": 0.21247170120477676,
"rewards/accuracy_reward": 1.3453501462936401,
"rewards/format_reward": 0.9921875,
"step": 222
},
{
"completion_length": 73.0859375,
"epoch": 2.6867469879518073,
"grad_norm": 3.6393688137680464,
"kl": 0.0810546875,
"learning_rate": 5.522088353413655e-07,
"loss": 0.0032,
"reward": 2.2808330059051514,
"reward_std": 0.1841505616903305,
"rewards/accuracy_reward": 1.280833125114441,
"rewards/format_reward": 1.0,
"step": 223
},
{
"completion_length": 77.1484375,
"epoch": 2.6987951807228914,
"grad_norm": 2.9614100491209516,
"kl": 0.08447265625,
"learning_rate": 5.502008032128513e-07,
"loss": 0.0034,
"reward": 2.256025791168213,
"reward_std": 0.22689195722341537,
"rewards/accuracy_reward": 1.271650791168213,
"rewards/format_reward": 0.984375,
"step": 224
},
{
"completion_length": 72.6015625,
"epoch": 2.710843373493976,
"grad_norm": 4.624802749562738,
"kl": 0.0810546875,
"learning_rate": 5.481927710843374e-07,
"loss": 0.0032,
"reward": 2.367666721343994,
"reward_std": 0.20605457574129105,
"rewards/accuracy_reward": 1.367666482925415,
"rewards/format_reward": 1.0,
"step": 225
},
{
"completion_length": 70.859375,
"epoch": 2.7228915662650603,
"grad_norm": 6.0943428059060505,
"kl": 0.10205078125,
"learning_rate": 5.461847389558233e-07,
"loss": 0.0041,
"reward": 2.3246583938598633,
"reward_std": 0.17254704982042313,
"rewards/accuracy_reward": 1.3324708938598633,
"rewards/format_reward": 0.9921875,
"step": 226
},
{
"completion_length": 75.640625,
"epoch": 2.734939759036145,
"grad_norm": 4.26546660385252,
"kl": 0.090087890625,
"learning_rate": 5.441767068273092e-07,
"loss": 0.0036,
"reward": 2.307809591293335,
"reward_std": 0.2002812698483467,
"rewards/accuracy_reward": 1.315622091293335,
"rewards/format_reward": 0.9921875,
"step": 227
},
{
"completion_length": 73.671875,
"epoch": 2.746987951807229,
"grad_norm": 3.4690497244218435,
"kl": 0.0927734375,
"learning_rate": 5.421686746987951e-07,
"loss": 0.0037,
"reward": 2.4064533710479736,
"reward_std": 0.1763758659362793,
"rewards/accuracy_reward": 1.4142658710479736,
"rewards/format_reward": 0.9921875,
"step": 228
},
{
"completion_length": 77.265625,
"epoch": 2.7590361445783134,
"grad_norm": 3.8015660942675313,
"kl": 0.107666015625,
"learning_rate": 5.401606425702811e-07,
"loss": 0.0043,
"reward": 2.417749524116516,
"reward_std": 0.20080577582120895,
"rewards/accuracy_reward": 1.4333745837211609,
"rewards/format_reward": 0.984375,
"step": 229
},
{
"completion_length": 78.6484375,
"epoch": 2.7710843373493974,
"grad_norm": 4.593078230781537,
"kl": 0.081298828125,
"learning_rate": 5.381526104417671e-07,
"loss": 0.0032,
"reward": 2.310904383659363,
"reward_std": 0.20601534098386765,
"rewards/accuracy_reward": 1.326529324054718,
"rewards/format_reward": 0.984375,
"step": 230
},
{
"completion_length": 69.75,
"epoch": 2.783132530120482,
"grad_norm": 4.781119598148597,
"kl": 0.092041015625,
"learning_rate": 5.36144578313253e-07,
"loss": 0.0037,
"reward": 2.4060455560684204,
"reward_std": 0.1945626586675644,
"rewards/accuracy_reward": 1.41385817527771,
"rewards/format_reward": 0.9921875,
"step": 231
},
{
"completion_length": 72.125,
"epoch": 2.7951807228915664,
"grad_norm": 3.6431689651666925,
"kl": 0.084716796875,
"learning_rate": 5.34136546184739e-07,
"loss": 0.0034,
"reward": 2.2687569856643677,
"reward_std": 0.20781449228525162,
"rewards/accuracy_reward": 1.2765693664550781,
"rewards/format_reward": 0.9921875,
"step": 232
},
{
"completion_length": 75.28125,
"epoch": 2.807228915662651,
"grad_norm": 3.463525581618983,
"kl": 0.0830078125,
"learning_rate": 5.321285140562248e-07,
"loss": 0.0033,
"reward": 2.2786985635757446,
"reward_std": 0.1869373545050621,
"rewards/accuracy_reward": 1.2865110039710999,
"rewards/format_reward": 0.9921875,
"step": 233
},
{
"completion_length": 72.390625,
"epoch": 2.819277108433735,
"grad_norm": 3.989550051539227,
"kl": 0.08935546875,
"learning_rate": 5.301204819277109e-07,
"loss": 0.0036,
"reward": 2.2122349739074707,
"reward_std": 0.17366793006658554,
"rewards/accuracy_reward": 1.212234914302826,
"rewards/format_reward": 1.0,
"step": 234
},
{
"completion_length": 68.4296875,
"epoch": 2.8313253012048194,
"grad_norm": 5.293732432179004,
"kl": 0.1162109375,
"learning_rate": 5.281124497991968e-07,
"loss": 0.0046,
"reward": 2.273004412651062,
"reward_std": 0.21551835536956787,
"rewards/accuracy_reward": 1.2730044722557068,
"rewards/format_reward": 1.0,
"step": 235
},
{
"completion_length": 70.4765625,
"epoch": 2.8433734939759034,
"grad_norm": 3.483964465031993,
"kl": 0.08642578125,
"learning_rate": 5.261044176706827e-07,
"loss": 0.0035,
"reward": 2.5097464323043823,
"reward_std": 0.21660751849412918,
"rewards/accuracy_reward": 1.509746491909027,
"rewards/format_reward": 1.0,
"step": 236
},
{
"completion_length": 67.1796875,
"epoch": 2.855421686746988,
"grad_norm": 3.2613871176315286,
"kl": 0.109619140625,
"learning_rate": 5.240963855421686e-07,
"loss": 0.0044,
"reward": 2.2154468297958374,
"reward_std": 0.2426525428891182,
"rewards/accuracy_reward": 1.2154468894004822,
"rewards/format_reward": 1.0,
"step": 237
},
{
"completion_length": 73.875,
"epoch": 2.8674698795180724,
"grad_norm": 5.04569953866162,
"kl": 0.105224609375,
"learning_rate": 5.220883534136546e-07,
"loss": 0.0042,
"reward": 2.3947439193725586,
"reward_std": 0.16551193594932556,
"rewards/accuracy_reward": 1.3947439193725586,
"rewards/format_reward": 1.0,
"step": 238
},
{
"completion_length": 70.03125,
"epoch": 2.8795180722891565,
"grad_norm": 3.2080049289623997,
"kl": 0.10986328125,
"learning_rate": 5.200803212851406e-07,
"loss": 0.0044,
"reward": 2.394848346710205,
"reward_std": 0.22504138201475143,
"rewards/accuracy_reward": 1.394848346710205,
"rewards/format_reward": 1.0,
"step": 239
},
{
"completion_length": 70.90625,
"epoch": 2.891566265060241,
"grad_norm": 3.843192487462901,
"kl": 0.1171875,
"learning_rate": 5.180722891566265e-07,
"loss": 0.0047,
"reward": 2.2219191789627075,
"reward_std": 0.2526251822710037,
"rewards/accuracy_reward": 1.2219191193580627,
"rewards/format_reward": 1.0,
"step": 240
},
{
"completion_length": 67.1328125,
"epoch": 2.9036144578313254,
"grad_norm": 3.0217979987505394,
"kl": 0.104248046875,
"learning_rate": 5.160642570281125e-07,
"loss": 0.0042,
"reward": 2.2357059717178345,
"reward_std": 0.181558758020401,
"rewards/accuracy_reward": 1.235705852508545,
"rewards/format_reward": 1.0,
"step": 241
},
{
"completion_length": 67.0390625,
"epoch": 2.9156626506024095,
"grad_norm": 4.171949473201647,
"kl": 0.1044921875,
"learning_rate": 5.140562248995983e-07,
"loss": 0.0042,
"reward": 2.3148874044418335,
"reward_std": 0.17748098075389862,
"rewards/accuracy_reward": 1.3148874640464783,
"rewards/format_reward": 1.0,
"step": 242
},
{
"completion_length": 65.8671875,
"epoch": 2.927710843373494,
"grad_norm": 8.908769866071971,
"kl": 0.11181640625,
"learning_rate": 5.120481927710843e-07,
"loss": 0.0045,
"reward": 2.2218422889709473,
"reward_std": 0.1961566060781479,
"rewards/accuracy_reward": 1.2296549081802368,
"rewards/format_reward": 0.9921875,
"step": 243
},
{
"completion_length": 63.6953125,
"epoch": 2.9397590361445785,
"grad_norm": 12.929344924116855,
"kl": 0.106201171875,
"learning_rate": 5.100401606425703e-07,
"loss": 0.0042,
"reward": 2.4831990003585815,
"reward_std": 0.17936265468597412,
"rewards/accuracy_reward": 1.4831989407539368,
"rewards/format_reward": 1.0,
"step": 244
},
{
"completion_length": 62.28125,
"epoch": 2.9518072289156625,
"grad_norm": 3.4705083145900404,
"kl": 0.111328125,
"learning_rate": 5.080321285140562e-07,
"loss": 0.0044,
"reward": 2.352734327316284,
"reward_std": 0.2174607664346695,
"rewards/accuracy_reward": 1.3683592081069946,
"rewards/format_reward": 0.984375,
"step": 245
},
{
"completion_length": 69.640625,
"epoch": 2.963855421686747,
"grad_norm": 4.178352503452598,
"kl": 0.111572265625,
"learning_rate": 5.060240963855421e-07,
"loss": 0.0045,
"reward": 2.3825145959854126,
"reward_std": 0.21491926908493042,
"rewards/accuracy_reward": 1.3903270959854126,
"rewards/format_reward": 0.9921875,
"step": 246
},
{
"completion_length": 65.875,
"epoch": 2.9759036144578315,
"grad_norm": 4.426857679190133,
"kl": 0.149169921875,
"learning_rate": 5.040160642570281e-07,
"loss": 0.006,
"reward": 2.1721856594085693,
"reward_std": 0.2390434294939041,
"rewards/accuracy_reward": 1.1721857190132141,
"rewards/format_reward": 1.0,
"step": 247
},
{
"completion_length": 70.9921875,
"epoch": 2.9879518072289155,
"grad_norm": 4.720913912936636,
"kl": 0.114013671875,
"learning_rate": 5.020080321285141e-07,
"loss": 0.0046,
"reward": 2.2051347494125366,
"reward_std": 0.2722553163766861,
"rewards/accuracy_reward": 1.2285721898078918,
"rewards/format_reward": 0.9765625,
"step": 248
},
{
"completion_length": 64.25000190734863,
"epoch": 3.0,
"grad_norm": 3.5181266600609904,
"kl": 0.11962890625,
"learning_rate": 5e-07,
"loss": 0.0048,
"reward": 2.1161320209503174,
"reward_std": 0.430472195148468,
"rewards/accuracy_reward": 1.1994653940200806,
"rewards/format_reward": 0.9166666865348816,
"step": 249
},
{
"completion_length": 68.1875,
"epoch": 3.0120481927710845,
"grad_norm": 3.5431810235066643,
"kl": 0.09619140625,
"learning_rate": 4.979919678714859e-07,
"loss": 0.0038,
"reward": 2.323817491531372,
"reward_std": 0.23299024999141693,
"rewards/accuracy_reward": 1.3316298723220825,
"rewards/format_reward": 0.9921875,
"step": 250
},
{
"completion_length": 71.6953125,
"epoch": 3.0240963855421685,
"grad_norm": 3.3542739826451173,
"kl": 0.08642578125,
"learning_rate": 4.959839357429718e-07,
"loss": 0.0035,
"reward": 2.411439895629883,
"reward_std": 0.19917739927768707,
"rewards/accuracy_reward": 1.4114398956298828,
"rewards/format_reward": 1.0,
"step": 251
},
{
"completion_length": 68.109375,
"epoch": 3.036144578313253,
"grad_norm": 12.151823073672764,
"kl": 0.110107421875,
"learning_rate": 4.939759036144578e-07,
"loss": 0.0044,
"reward": 2.5318474769592285,
"reward_std": 0.18056734651327133,
"rewards/accuracy_reward": 1.5396599173545837,
"rewards/format_reward": 0.9921875,
"step": 252
},
{
"completion_length": 72.578125,
"epoch": 3.0481927710843375,
"grad_norm": 3.219943316402962,
"kl": 0.099853515625,
"learning_rate": 4.919678714859438e-07,
"loss": 0.004,
"reward": 2.3200578689575195,
"reward_std": 0.15618911385536194,
"rewards/accuracy_reward": 1.3200578689575195,
"rewards/format_reward": 1.0,
"step": 253
},
{
"completion_length": 61.3828125,
"epoch": 3.0602409638554215,
"grad_norm": 3.865556225897638,
"kl": 0.10888671875,
"learning_rate": 4.899598393574297e-07,
"loss": 0.0044,
"reward": 2.209138035774231,
"reward_std": 0.17473262548446655,
"rewards/accuracy_reward": 1.2091379761695862,
"rewards/format_reward": 1.0,
"step": 254
},
{
"completion_length": 66.7421875,
"epoch": 3.072289156626506,
"grad_norm": 4.017362101946035,
"kl": 0.1259765625,
"learning_rate": 4.879518072289156e-07,
"loss": 0.005,
"reward": 2.139701724052429,
"reward_std": 0.22376088798046112,
"rewards/accuracy_reward": 1.1397016048431396,
"rewards/format_reward": 1.0,
"step": 255
},
{
"completion_length": 62.71875,
"epoch": 3.0843373493975905,
"grad_norm": 3.4288754746391947,
"kl": 0.140625,
"learning_rate": 4.859437751004016e-07,
"loss": 0.0056,
"reward": 2.2105259895324707,
"reward_std": 0.22984497249126434,
"rewards/accuracy_reward": 1.2261508703231812,
"rewards/format_reward": 0.984375,
"step": 256
},
{
"completion_length": 66.6953125,
"epoch": 3.0963855421686746,
"grad_norm": 3.481985490355864,
"kl": 0.1181640625,
"learning_rate": 4.839357429718875e-07,
"loss": 0.0047,
"reward": 2.5049203634262085,
"reward_std": 0.1857297122478485,
"rewards/accuracy_reward": 1.5049203634262085,
"rewards/format_reward": 1.0,
"step": 257
},
{
"completion_length": 67.484375,
"epoch": 3.108433734939759,
"grad_norm": 3.6977753194922403,
"kl": 0.107666015625,
"learning_rate": 4.819277108433735e-07,
"loss": 0.0043,
"reward": 2.3002774715423584,
"reward_std": 0.21863283962011337,
"rewards/accuracy_reward": 1.3080899119377136,
"rewards/format_reward": 0.9921875,
"step": 258
},
{
"completion_length": 71.984375,
"epoch": 3.1204819277108435,
"grad_norm": 3.2391554999759054,
"kl": 0.099853515625,
"learning_rate": 4.799196787148594e-07,
"loss": 0.004,
"reward": 2.404132843017578,
"reward_std": 0.19443362206220627,
"rewards/accuracy_reward": 1.4119452238082886,
"rewards/format_reward": 0.9921875,
"step": 259
},
{
"completion_length": 70.3984375,
"epoch": 3.1325301204819276,
"grad_norm": 3.8470897735347993,
"kl": 0.11181640625,
"learning_rate": 4.779116465863453e-07,
"loss": 0.0045,
"reward": 2.2314306497573853,
"reward_std": 0.1860732138156891,
"rewards/accuracy_reward": 1.2392430305480957,
"rewards/format_reward": 0.9921875,
"step": 260
},
{
"completion_length": 71.7109375,
"epoch": 3.144578313253012,
"grad_norm": 5.7256880192839965,
"kl": 0.101806640625,
"learning_rate": 4.7590361445783126e-07,
"loss": 0.0041,
"reward": 2.3397083282470703,
"reward_std": 0.21985551714897156,
"rewards/accuracy_reward": 1.3397083282470703,
"rewards/format_reward": 1.0,
"step": 261
},
{
"completion_length": 72.7265625,
"epoch": 3.1566265060240966,
"grad_norm": 4.6788843643036255,
"kl": 0.183837890625,
"learning_rate": 4.7389558232931724e-07,
"loss": 0.0074,
"reward": 2.288654088973999,
"reward_std": 0.25063957273960114,
"rewards/accuracy_reward": 1.296466588973999,
"rewards/format_reward": 0.9921875,
"step": 262
},
{
"completion_length": 66.96875,
"epoch": 3.1686746987951806,
"grad_norm": 4.000735227178484,
"kl": 0.1171875,
"learning_rate": 4.7188755020080317e-07,
"loss": 0.0047,
"reward": 2.385547637939453,
"reward_std": 0.179743941873312,
"rewards/accuracy_reward": 1.393360197544098,
"rewards/format_reward": 0.9921875,
"step": 263
},
{
"completion_length": 73.078125,
"epoch": 3.180722891566265,
"grad_norm": 3.2436175706744903,
"kl": 0.08837890625,
"learning_rate": 4.6987951807228915e-07,
"loss": 0.0035,
"reward": 2.3714927434921265,
"reward_std": 0.1866167113184929,
"rewards/accuracy_reward": 1.3793052434921265,
"rewards/format_reward": 0.9921875,
"step": 264
},
{
"completion_length": 67.7578125,
"epoch": 3.1927710843373496,
"grad_norm": 4.16773338040152,
"kl": 0.09619140625,
"learning_rate": 4.678714859437751e-07,
"loss": 0.0038,
"reward": 2.256360650062561,
"reward_std": 0.2188187688589096,
"rewards/accuracy_reward": 1.256360650062561,
"rewards/format_reward": 1.0,
"step": 265
},
{
"completion_length": 71.6796875,
"epoch": 3.2048192771084336,
"grad_norm": 3.7554898641141388,
"kl": 0.094482421875,
"learning_rate": 4.6586345381526106e-07,
"loss": 0.0038,
"reward": 2.285356283187866,
"reward_std": 0.2733229324221611,
"rewards/accuracy_reward": 1.2853562831878662,
"rewards/format_reward": 1.0,
"step": 266
},
{
"completion_length": 69.53125,
"epoch": 3.216867469879518,
"grad_norm": 3.1396081677261747,
"kl": 0.11572265625,
"learning_rate": 4.63855421686747e-07,
"loss": 0.0046,
"reward": 2.194140672683716,
"reward_std": 0.2116081416606903,
"rewards/accuracy_reward": 1.1941407322883606,
"rewards/format_reward": 1.0,
"step": 267
},
{
"completion_length": 67.8203125,
"epoch": 3.2289156626506026,
"grad_norm": 7.260439555595242,
"kl": 0.08837890625,
"learning_rate": 4.6184738955823296e-07,
"loss": 0.0035,
"reward": 2.252182364463806,
"reward_std": 0.1803755983710289,
"rewards/accuracy_reward": 1.259994924068451,
"rewards/format_reward": 0.9921875,
"step": 268
},
{
"completion_length": 67.390625,
"epoch": 3.2409638554216866,
"grad_norm": 3.5049860895757696,
"kl": 0.08935546875,
"learning_rate": 4.5983935742971884e-07,
"loss": 0.0036,
"reward": 2.2208237648010254,
"reward_std": 0.23105446994304657,
"rewards/accuracy_reward": 1.2286362648010254,
"rewards/format_reward": 0.9921875,
"step": 269
},
{
"completion_length": 70.8515625,
"epoch": 3.253012048192771,
"grad_norm": 5.489156591080696,
"kl": 0.131591796875,
"learning_rate": 4.5783132530120476e-07,
"loss": 0.0053,
"reward": 2.2373805046081543,
"reward_std": 0.2680865153670311,
"rewards/accuracy_reward": 1.2373805046081543,
"rewards/format_reward": 1.0,
"step": 270
},
{
"completion_length": 67.3359375,
"epoch": 3.2650602409638556,
"grad_norm": 3.943203757539833,
"kl": 0.102783203125,
"learning_rate": 4.5582329317269074e-07,
"loss": 0.0041,
"reward": 2.2856905460357666,
"reward_std": 0.2643607556819916,
"rewards/accuracy_reward": 1.2856906652450562,
"rewards/format_reward": 1.0,
"step": 271
},
{
"completion_length": 76.703125,
"epoch": 3.2771084337349397,
"grad_norm": 4.067837029288379,
"kl": 0.14794921875,
"learning_rate": 4.5381526104417667e-07,
"loss": 0.0059,
"reward": 2.2173361778259277,
"reward_std": 0.23457611352205276,
"rewards/accuracy_reward": 1.2251486778259277,
"rewards/format_reward": 0.9921875,
"step": 272
},
{
"completion_length": 70.9765625,
"epoch": 3.289156626506024,
"grad_norm": 3.356513487854019,
"kl": 0.105712890625,
"learning_rate": 4.5180722891566265e-07,
"loss": 0.0042,
"reward": 2.3274762630462646,
"reward_std": 0.1404755339026451,
"rewards/accuracy_reward": 1.327476143836975,
"rewards/format_reward": 1.0,
"step": 273
},
{
"completion_length": 73.5546875,
"epoch": 3.3012048192771086,
"grad_norm": 2.8662666869018194,
"kl": 0.087646484375,
"learning_rate": 4.497991967871486e-07,
"loss": 0.0035,
"reward": 2.4234249591827393,
"reward_std": 0.23345230519771576,
"rewards/accuracy_reward": 1.4234249591827393,
"rewards/format_reward": 1.0,
"step": 274
},
{
"completion_length": 76.2890625,
"epoch": 3.3132530120481927,
"grad_norm": 3.6359732134875027,
"kl": 0.0849609375,
"learning_rate": 4.4779116465863456e-07,
"loss": 0.0034,
"reward": 2.2799594402313232,
"reward_std": 0.17667143046855927,
"rewards/accuracy_reward": 1.2799595594406128,
"rewards/format_reward": 1.0,
"step": 275
},
{
"completion_length": 74.9296875,
"epoch": 3.325301204819277,
"grad_norm": 3.4769457078888513,
"kl": 0.1181640625,
"learning_rate": 4.4578313253012043e-07,
"loss": 0.0047,
"reward": 2.282673478126526,
"reward_std": 0.20452508330345154,
"rewards/accuracy_reward": 1.282673418521881,
"rewards/format_reward": 1.0,
"step": 276
},
{
"completion_length": 73.828125,
"epoch": 3.337349397590361,
"grad_norm": 5.230024279024117,
"kl": 0.0830078125,
"learning_rate": 4.437751004016064e-07,
"loss": 0.0033,
"reward": 2.2097089290618896,
"reward_std": 0.22180304676294327,
"rewards/accuracy_reward": 1.2097087502479553,
"rewards/format_reward": 1.0,
"step": 277
},
{
"completion_length": 72.7109375,
"epoch": 3.3493975903614457,
"grad_norm": 3.8728422379908416,
"kl": 0.095458984375,
"learning_rate": 4.4176706827309234e-07,
"loss": 0.0038,
"reward": 2.491241931915283,
"reward_std": 0.22739917039871216,
"rewards/accuracy_reward": 1.4912420511245728,
"rewards/format_reward": 1.0,
"step": 278
},
{
"completion_length": 78.5078125,
"epoch": 3.36144578313253,
"grad_norm": 3.6858021846036535,
"kl": 0.0908203125,
"learning_rate": 4.3975903614457827e-07,
"loss": 0.0036,
"reward": 2.243127226829529,
"reward_std": 0.22939348965883255,
"rewards/accuracy_reward": 1.2431272268295288,
"rewards/format_reward": 1.0,
"step": 279
},
{
"completion_length": 72.765625,
"epoch": 3.3734939759036147,
"grad_norm": 4.156042584491376,
"kl": 0.1044921875,
"learning_rate": 4.3775100401606425e-07,
"loss": 0.0042,
"reward": 2.2150485515594482,
"reward_std": 0.23025363683700562,
"rewards/accuracy_reward": 1.2228610515594482,
"rewards/format_reward": 0.9921875,
"step": 280
},
{
"completion_length": 77.0390625,
"epoch": 3.3855421686746987,
"grad_norm": 3.3549823921313475,
"kl": 0.100341796875,
"learning_rate": 4.3574297188755017e-07,
"loss": 0.004,
"reward": 2.211505889892578,
"reward_std": 0.24677567183971405,
"rewards/accuracy_reward": 1.227130949497223,
"rewards/format_reward": 0.984375,
"step": 281
},
{
"completion_length": 78.296875,
"epoch": 3.397590361445783,
"grad_norm": 3.5036767872389514,
"kl": 0.0859375,
"learning_rate": 4.3373493975903615e-07,
"loss": 0.0034,
"reward": 2.346588611602783,
"reward_std": 0.20112959295511246,
"rewards/accuracy_reward": 1.3465884923934937,
"rewards/format_reward": 1.0,
"step": 282
},
{
"completion_length": 84.484375,
"epoch": 3.4096385542168672,
"grad_norm": 3.0794227415803874,
"kl": 0.09326171875,
"learning_rate": 4.3172690763052203e-07,
"loss": 0.0037,
"reward": 2.230928421020508,
"reward_std": 0.26287955790758133,
"rewards/accuracy_reward": 1.2387409210205078,
"rewards/format_reward": 0.9921875,
"step": 283
},
{
"completion_length": 84.0546875,
"epoch": 3.4216867469879517,
"grad_norm": 9.632017573370238,
"kl": 0.086181640625,
"learning_rate": 4.29718875502008e-07,
"loss": 0.0034,
"reward": 2.2049087285995483,
"reward_std": 0.19046999514102936,
"rewards/accuracy_reward": 1.204908847808838,
"rewards/format_reward": 1.0,
"step": 284
},
{
"completion_length": 74.875,
"epoch": 3.433734939759036,
"grad_norm": 3.04437077789607,
"kl": 0.07861328125,
"learning_rate": 4.2771084337349393e-07,
"loss": 0.0031,
"reward": 2.3966974020004272,
"reward_std": 0.1937796175479889,
"rewards/accuracy_reward": 1.3966973423957825,
"rewards/format_reward": 1.0,
"step": 285
},
{
"completion_length": 75.8359375,
"epoch": 3.4457831325301207,
"grad_norm": 5.311045139915637,
"kl": 0.163330078125,
"learning_rate": 4.257028112449799e-07,
"loss": 0.0065,
"reward": 2.3752543926239014,
"reward_std": 0.2273067831993103,
"rewards/accuracy_reward": 1.3830668926239014,
"rewards/format_reward": 0.9921875,
"step": 286
},
{
"completion_length": 78.6328125,
"epoch": 3.4578313253012047,
"grad_norm": 3.0911678350526763,
"kl": 0.082763671875,
"learning_rate": 4.2369477911646584e-07,
"loss": 0.0033,
"reward": 2.3473113775253296,
"reward_std": 0.14994988590478897,
"rewards/accuracy_reward": 1.3473113775253296,
"rewards/format_reward": 1.0,
"step": 287
},
{
"completion_length": 79.1640625,
"epoch": 3.4698795180722892,
"grad_norm": 3.5847413181475947,
"kl": 0.0849609375,
"learning_rate": 4.216867469879518e-07,
"loss": 0.0034,
"reward": 2.433477997779846,
"reward_std": 0.1769290268421173,
"rewards/accuracy_reward": 1.4334778785705566,
"rewards/format_reward": 1.0,
"step": 288
},
{
"completion_length": 83.390625,
"epoch": 3.4819277108433733,
"grad_norm": 4.01569190307187,
"kl": 0.09521484375,
"learning_rate": 4.1967871485943775e-07,
"loss": 0.0038,
"reward": 2.2789034843444824,
"reward_std": 0.2845103293657303,
"rewards/accuracy_reward": 1.2867161631584167,
"rewards/format_reward": 0.9921875,
"step": 289
},
{
"completion_length": 81.90625,
"epoch": 3.4939759036144578,
"grad_norm": 3.286849126987869,
"kl": 0.08642578125,
"learning_rate": 4.176706827309237e-07,
"loss": 0.0035,
"reward": 2.362874150276184,
"reward_std": 0.19387810677289963,
"rewards/accuracy_reward": 1.362874150276184,
"rewards/format_reward": 1.0,
"step": 290
},
{
"completion_length": 82.6640625,
"epoch": 3.5060240963855422,
"grad_norm": 3.658103173473351,
"kl": 0.10888671875,
"learning_rate": 4.156626506024096e-07,
"loss": 0.0043,
"reward": 2.0810331106185913,
"reward_std": 0.3057002127170563,
"rewards/accuracy_reward": 1.088845670223236,
"rewards/format_reward": 0.9921875,
"step": 291
},
{
"completion_length": 78.921875,
"epoch": 3.5180722891566267,
"grad_norm": 3.7103596490236774,
"kl": 0.08349609375,
"learning_rate": 4.1365461847389553e-07,
"loss": 0.0033,
"reward": 2.511967420578003,
"reward_std": 0.16890805214643478,
"rewards/accuracy_reward": 1.5119673609733582,
"rewards/format_reward": 1.0,
"step": 292
},
{
"completion_length": 79.0703125,
"epoch": 3.5301204819277108,
"grad_norm": 4.407185593870522,
"kl": 0.099853515625,
"learning_rate": 4.116465863453815e-07,
"loss": 0.004,
"reward": 2.298495650291443,
"reward_std": 0.18783311545848846,
"rewards/accuracy_reward": 1.2984956502914429,
"rewards/format_reward": 1.0,
"step": 293
},
{
"completion_length": 77.796875,
"epoch": 3.5421686746987953,
"grad_norm": 4.826014110118868,
"kl": 0.09814453125,
"learning_rate": 4.0963855421686744e-07,
"loss": 0.0039,
"reward": 2.2871015071868896,
"reward_std": 0.2442024052143097,
"rewards/accuracy_reward": 1.2871016263961792,
"rewards/format_reward": 1.0,
"step": 294
},
{
"completion_length": 81.0390625,
"epoch": 3.5542168674698793,
"grad_norm": 5.044218587715949,
"kl": 0.1220703125,
"learning_rate": 4.076305220883534e-07,
"loss": 0.0049,
"reward": 2.3120492696762085,
"reward_std": 0.26864828169345856,
"rewards/accuracy_reward": 1.3198617696762085,
"rewards/format_reward": 0.9921875,
"step": 295
},
{
"completion_length": 81.8046875,
"epoch": 3.566265060240964,
"grad_norm": 4.035337217053536,
"kl": 0.102783203125,
"learning_rate": 4.0562248995983934e-07,
"loss": 0.0041,
"reward": 2.2244678735733032,
"reward_std": 0.19216852635145187,
"rewards/accuracy_reward": 1.2244678139686584,
"rewards/format_reward": 1.0,
"step": 296
},
{
"completion_length": 82.1875,
"epoch": 3.5783132530120483,
"grad_norm": 5.473424541297646,
"kl": 0.082275390625,
"learning_rate": 4.036144578313253e-07,
"loss": 0.0033,
"reward": 2.1482508182525635,
"reward_std": 0.2517557144165039,
"rewards/accuracy_reward": 1.1560633182525635,
"rewards/format_reward": 0.9921875,
"step": 297
},
{
"completion_length": 76.8828125,
"epoch": 3.5903614457831328,
"grad_norm": 3.624065660089473,
"kl": 0.099609375,
"learning_rate": 4.0160642570281125e-07,
"loss": 0.004,
"reward": 2.460606813430786,
"reward_std": 0.20688265562057495,
"rewards/accuracy_reward": 1.476231873035431,
"rewards/format_reward": 0.984375,
"step": 298
},
{
"completion_length": 73.8828125,
"epoch": 3.602409638554217,
"grad_norm": 3.2496622555871775,
"kl": 0.10302734375,
"learning_rate": 3.995983935742971e-07,
"loss": 0.0041,
"reward": 2.448202967643738,
"reward_std": 0.20513835549354553,
"rewards/accuracy_reward": 1.4482029676437378,
"rewards/format_reward": 1.0,
"step": 299
},
{
"completion_length": 73.8828125,
"epoch": 3.6144578313253013,
"grad_norm": 3.248403260656612,
"kl": 0.1142578125,
"learning_rate": 3.975903614457831e-07,
"loss": 0.0046,
"reward": 2.3579249382019043,
"reward_std": 0.26106585562229156,
"rewards/accuracy_reward": 1.3657374382019043,
"rewards/format_reward": 0.9921875,
"step": 300
},
{
"completion_length": 81.78125,
"epoch": 3.6265060240963853,
"grad_norm": 4.192951592702023,
"kl": 0.090087890625,
"learning_rate": 3.9558232931726903e-07,
"loss": 0.0036,
"reward": 2.320730686187744,
"reward_std": 0.17225497588515282,
"rewards/accuracy_reward": 1.3207308053970337,
"rewards/format_reward": 1.0,
"step": 301
},
{
"completion_length": 81.78125,
"epoch": 3.63855421686747,
"grad_norm": 3.914334064533718,
"kl": 0.082763671875,
"learning_rate": 3.93574297188755e-07,
"loss": 0.0033,
"reward": 2.2756303548812866,
"reward_std": 0.21440081298351288,
"rewards/accuracy_reward": 1.2834429144859314,
"rewards/format_reward": 0.9921875,
"step": 302
},
{
"completion_length": 83.984375,
"epoch": 3.6506024096385543,
"grad_norm": 2.9158995310046705,
"kl": 0.09326171875,
"learning_rate": 3.9156626506024094e-07,
"loss": 0.0037,
"reward": 2.340207576751709,
"reward_std": 0.22486132383346558,
"rewards/accuracy_reward": 1.3402075171470642,
"rewards/format_reward": 1.0,
"step": 303
},
{
"completion_length": 73.0078125,
"epoch": 3.662650602409639,
"grad_norm": 3.64523826351094,
"kl": 0.130615234375,
"learning_rate": 3.895582329317269e-07,
"loss": 0.0052,
"reward": 2.306045651435852,
"reward_std": 0.21042678505182266,
"rewards/accuracy_reward": 1.313858151435852,
"rewards/format_reward": 0.9921875,
"step": 304
},
{
"completion_length": 77.140625,
"epoch": 3.674698795180723,
"grad_norm": 4.763683185347457,
"kl": 0.09619140625,
"learning_rate": 3.8755020080321285e-07,
"loss": 0.0038,
"reward": 2.292635202407837,
"reward_std": 0.24200939387083054,
"rewards/accuracy_reward": 1.308260202407837,
"rewards/format_reward": 0.984375,
"step": 305
},
{
"completion_length": 80.6875,
"epoch": 3.6867469879518073,
"grad_norm": 15.378313149094321,
"kl": 0.130126953125,
"learning_rate": 3.8554216867469877e-07,
"loss": 0.0052,
"reward": 2.2641184329986572,
"reward_std": 0.20184506475925446,
"rewards/accuracy_reward": 1.2719308137893677,
"rewards/format_reward": 0.9921875,
"step": 306
},
{
"completion_length": 72.4453125,
"epoch": 3.6987951807228914,
"grad_norm": 6.1838290298686225,
"kl": 0.114501953125,
"learning_rate": 3.835341365461847e-07,
"loss": 0.0046,
"reward": 2.4186692237854004,
"reward_std": 0.20656991004943848,
"rewards/accuracy_reward": 1.4264817833900452,
"rewards/format_reward": 0.9921875,
"step": 307
},
{
"completion_length": 73.71875,
"epoch": 3.710843373493976,
"grad_norm": 3.6680281562358794,
"kl": 0.092041015625,
"learning_rate": 3.815261044176707e-07,
"loss": 0.0037,
"reward": 2.3598402738571167,
"reward_std": 0.1814076155424118,
"rewards/accuracy_reward": 1.3598402738571167,
"rewards/format_reward": 1.0,
"step": 308
},
{
"completion_length": 75.5625,
"epoch": 3.7228915662650603,
"grad_norm": 4.1513164017455635,
"kl": 0.11962890625,
"learning_rate": 3.795180722891566e-07,
"loss": 0.0048,
"reward": 2.2364041805267334,
"reward_std": 0.20799466967582703,
"rewards/accuracy_reward": 1.236404299736023,
"rewards/format_reward": 1.0,
"step": 309
},
{
"completion_length": 76.2109375,
"epoch": 3.734939759036145,
"grad_norm": 4.53835509987933,
"kl": 0.088623046875,
"learning_rate": 3.7751004016064253e-07,
"loss": 0.0036,
"reward": 2.3527251482009888,
"reward_std": 0.17692391574382782,
"rewards/accuracy_reward": 1.3527252078056335,
"rewards/format_reward": 1.0,
"step": 310
},
{
"completion_length": 80.4375,
"epoch": 3.746987951807229,
"grad_norm": 3.703393707261026,
"kl": 0.1103515625,
"learning_rate": 3.755020080321285e-07,
"loss": 0.0044,
"reward": 2.298377275466919,
"reward_std": 0.21109677106142044,
"rewards/accuracy_reward": 1.2983773350715637,
"rewards/format_reward": 1.0,
"step": 311
},
{
"completion_length": 77.8125,
"epoch": 3.7590361445783134,
"grad_norm": 3.914375784414754,
"kl": 0.138916015625,
"learning_rate": 3.7349397590361444e-07,
"loss": 0.0056,
"reward": 2.1520947217941284,
"reward_std": 0.19967754930257797,
"rewards/accuracy_reward": 1.1520947813987732,
"rewards/format_reward": 1.0,
"step": 312
},
{
"completion_length": 79.2578125,
"epoch": 3.7710843373493974,
"grad_norm": 5.606330092523797,
"kl": 0.091064453125,
"learning_rate": 3.714859437751004e-07,
"loss": 0.0036,
"reward": 2.3204472064971924,
"reward_std": 0.1748044565320015,
"rewards/accuracy_reward": 1.3204472661018372,
"rewards/format_reward": 1.0,
"step": 313
},
{
"completion_length": 74.84375,
"epoch": 3.783132530120482,
"grad_norm": 3.2348525038063736,
"kl": 0.08447265625,
"learning_rate": 3.694779116465863e-07,
"loss": 0.0034,
"reward": 2.496751070022583,
"reward_std": 0.2072158306837082,
"rewards/accuracy_reward": 1.496751070022583,
"rewards/format_reward": 1.0,
"step": 314
},
{
"completion_length": 74.296875,
"epoch": 3.7951807228915664,
"grad_norm": 3.7371491385040483,
"kl": 0.0771484375,
"learning_rate": 3.674698795180723e-07,
"loss": 0.0031,
"reward": 2.395453691482544,
"reward_std": 0.16877512633800507,
"rewards/accuracy_reward": 1.3954537510871887,
"rewards/format_reward": 1.0,
"step": 315
},
{
"completion_length": 72.8671875,
"epoch": 3.807228915662651,
"grad_norm": 5.799331345023467,
"kl": 0.09619140625,
"learning_rate": 3.654618473895582e-07,
"loss": 0.0039,
"reward": 2.307594895362854,
"reward_std": 0.1985296756029129,
"rewards/accuracy_reward": 1.307594895362854,
"rewards/format_reward": 1.0,
"step": 316
},
{
"completion_length": 72.84375,
"epoch": 3.819277108433735,
"grad_norm": 5.215215330938529,
"kl": 0.11083984375,
"learning_rate": 3.634538152610442e-07,
"loss": 0.0044,
"reward": 2.2713290452957153,
"reward_std": 0.15980049967765808,
"rewards/accuracy_reward": 1.2791414856910706,
"rewards/format_reward": 0.9921875,
"step": 317
},
{
"completion_length": 66.28125,
"epoch": 3.8313253012048194,
"grad_norm": 9.42828281313003,
"kl": 0.106201171875,
"learning_rate": 3.614457831325301e-07,
"loss": 0.0042,
"reward": 2.441011667251587,
"reward_std": 0.21370699256658554,
"rewards/accuracy_reward": 1.4566364884376526,
"rewards/format_reward": 0.984375,
"step": 318
},
{
"completion_length": 74.3359375,
"epoch": 3.8433734939759034,
"grad_norm": 3.380164477319568,
"kl": 0.094970703125,
"learning_rate": 3.5943775100401604e-07,
"loss": 0.0038,
"reward": 2.5070927143096924,
"reward_std": 0.16660126298666,
"rewards/accuracy_reward": 1.5149051547050476,
"rewards/format_reward": 0.9921875,
"step": 319
},
{
"completion_length": 71.3046875,
"epoch": 3.855421686746988,
"grad_norm": 4.006205885169367,
"kl": 0.128662109375,
"learning_rate": 3.57429718875502e-07,
"loss": 0.0051,
"reward": 2.3042829036712646,
"reward_std": 0.2031613141298294,
"rewards/accuracy_reward": 1.3042829036712646,
"rewards/format_reward": 1.0,
"step": 320
},
{
"completion_length": 73.9609375,
"epoch": 3.8674698795180724,
"grad_norm": 5.771036516275782,
"kl": 0.093017578125,
"learning_rate": 3.554216867469879e-07,
"loss": 0.0037,
"reward": 2.422416090965271,
"reward_std": 0.19139418005943298,
"rewards/accuracy_reward": 1.4302285313606262,
"rewards/format_reward": 0.9921875,
"step": 321
},
{
"completion_length": 71.734375,
"epoch": 3.8795180722891565,
"grad_norm": 5.860041479699707,
"kl": 0.110595703125,
"learning_rate": 3.5341365461847387e-07,
"loss": 0.0044,
"reward": 2.100473999977112,
"reward_std": 0.21565508097410202,
"rewards/accuracy_reward": 1.1004739999771118,
"rewards/format_reward": 1.0,
"step": 322
},
{
"completion_length": 69.046875,
"epoch": 3.891566265060241,
"grad_norm": 4.962719097630754,
"kl": 0.1396484375,
"learning_rate": 3.514056224899598e-07,
"loss": 0.0056,
"reward": 2.337049961090088,
"reward_std": 0.201468363404274,
"rewards/accuracy_reward": 1.337049961090088,
"rewards/format_reward": 1.0,
"step": 323
},
{
"completion_length": 70.0234375,
"epoch": 3.9036144578313254,
"grad_norm": 3.786778485554144,
"kl": 0.1064453125,
"learning_rate": 3.493975903614458e-07,
"loss": 0.0043,
"reward": 2.282514452934265,
"reward_std": 0.2470734864473343,
"rewards/accuracy_reward": 1.2903268933296204,
"rewards/format_reward": 0.9921875,
"step": 324
},
{
"completion_length": 66.5546875,
"epoch": 3.9156626506024095,
"grad_norm": 5.681847770854111,
"kl": 0.14599609375,
"learning_rate": 3.473895582329317e-07,
"loss": 0.0059,
"reward": 2.2830464839935303,
"reward_std": 0.16951018571853638,
"rewards/accuracy_reward": 1.2830466032028198,
"rewards/format_reward": 1.0,
"step": 325
},
{
"completion_length": 69.9765625,
"epoch": 3.927710843373494,
"grad_norm": 3.545177223680582,
"kl": 0.1123046875,
"learning_rate": 3.453815261044177e-07,
"loss": 0.0045,
"reward": 2.3249276876449585,
"reward_std": 0.23469389975070953,
"rewards/accuracy_reward": 1.3249276876449585,
"rewards/format_reward": 1.0,
"step": 326
},
{
"completion_length": 67.2109375,
"epoch": 3.9397590361445785,
"grad_norm": 4.464381426334607,
"kl": 0.111328125,
"learning_rate": 3.433734939759036e-07,
"loss": 0.0045,
"reward": 2.313346743583679,
"reward_std": 0.24960950016975403,
"rewards/accuracy_reward": 1.321159303188324,
"rewards/format_reward": 0.9921875,
"step": 327
},
{
"completion_length": 69.5390625,
"epoch": 3.9518072289156625,
"grad_norm": 5.503294892764904,
"kl": 0.13818359375,
"learning_rate": 3.413654618473896e-07,
"loss": 0.0055,
"reward": 2.250451922416687,
"reward_std": 0.19627484679222107,
"rewards/accuracy_reward": 1.2582644820213318,
"rewards/format_reward": 0.9921875,
"step": 328
},
{
"completion_length": 72.875,
"epoch": 3.963855421686747,
"grad_norm": 3.94333602961405,
"kl": 0.126953125,
"learning_rate": 3.3935742971887547e-07,
"loss": 0.0051,
"reward": 2.4282917976379395,
"reward_std": 0.23817364871501923,
"rewards/accuracy_reward": 1.4361043572425842,
"rewards/format_reward": 0.9921875,
"step": 329
},
{
"completion_length": 68.078125,
"epoch": 3.9759036144578315,
"grad_norm": 4.246221946155538,
"kl": 0.10302734375,
"learning_rate": 3.373493975903614e-07,
"loss": 0.0041,
"reward": 2.3756778240203857,
"reward_std": 0.23032685369253159,
"rewards/accuracy_reward": 1.3756778836250305,
"rewards/format_reward": 1.0,
"step": 330
},
{
"completion_length": 63.171875,
"epoch": 3.9879518072289155,
"grad_norm": 4.823180720092978,
"kl": 0.14111328125,
"learning_rate": 3.353413654618474e-07,
"loss": 0.0057,
"reward": 2.2716495990753174,
"reward_std": 0.25546562671661377,
"rewards/accuracy_reward": 1.2794621586799622,
"rewards/format_reward": 0.9921875,
"step": 331
},
{
"completion_length": 79.75000381469727,
"epoch": 4.0,
"grad_norm": 3.966089593429622,
"kl": 0.10986328125,
"learning_rate": 3.333333333333333e-07,
"loss": 0.0047,
"reward": 1.9844202995300293,
"reward_std": 0.41577973030507565,
"rewards/accuracy_reward": 0.9844204187393188,
"rewards/format_reward": 1.0,
"step": 332
},
{
"completion_length": 67.8984375,
"epoch": 4.0120481927710845,
"grad_norm": 3.4890518846644203,
"kl": 0.112548828125,
"learning_rate": 3.313253012048193e-07,
"loss": 0.0045,
"reward": 2.273194432258606,
"reward_std": 0.1845482587814331,
"rewards/accuracy_reward": 1.2810069918632507,
"rewards/format_reward": 0.9921875,
"step": 333
},
{
"completion_length": 70.1328125,
"epoch": 4.024096385542169,
"grad_norm": 3.1401475074211698,
"kl": 0.106201171875,
"learning_rate": 3.293172690763052e-07,
"loss": 0.0042,
"reward": 2.348654627799988,
"reward_std": 0.20452319085597992,
"rewards/accuracy_reward": 1.3564670085906982,
"rewards/format_reward": 0.9921875,
"step": 334
},
{
"completion_length": 67.4296875,
"epoch": 4.036144578313253,
"grad_norm": 4.049959483426693,
"kl": 0.107177734375,
"learning_rate": 3.273092369477912e-07,
"loss": 0.0043,
"reward": 2.270454525947571,
"reward_std": 0.21142029762268066,
"rewards/accuracy_reward": 1.2704546451568604,
"rewards/format_reward": 1.0,
"step": 335
},
{
"completion_length": 71.1484375,
"epoch": 4.048192771084337,
"grad_norm": 3.9561612834766273,
"kl": 0.097412109375,
"learning_rate": 3.2530120481927706e-07,
"loss": 0.0039,
"reward": 2.1833893060684204,
"reward_std": 0.1801520176231861,
"rewards/accuracy_reward": 1.1912018656730652,
"rewards/format_reward": 0.9921875,
"step": 336
},
{
"completion_length": 69.59375,
"epoch": 4.0602409638554215,
"grad_norm": 3.977655100011985,
"kl": 0.1474609375,
"learning_rate": 3.2329317269076304e-07,
"loss": 0.0059,
"reward": 2.2047336101531982,
"reward_std": 0.1999206244945526,
"rewards/accuracy_reward": 1.204733669757843,
"rewards/format_reward": 1.0,
"step": 337
},
{
"completion_length": 61.4765625,
"epoch": 4.072289156626506,
"grad_norm": 4.191698428231115,
"kl": 0.12939453125,
"learning_rate": 3.2128514056224897e-07,
"loss": 0.0052,
"reward": 2.3498200178146362,
"reward_std": 0.2275300845503807,
"rewards/accuracy_reward": 1.3498198986053467,
"rewards/format_reward": 1.0,
"step": 338
},
{
"completion_length": 64.4140625,
"epoch": 4.0843373493975905,
"grad_norm": 3.9067810348739114,
"kl": 0.116943359375,
"learning_rate": 3.192771084337349e-07,
"loss": 0.0047,
"reward": 2.352308511734009,
"reward_std": 0.22002745419740677,
"rewards/accuracy_reward": 1.3523083925247192,
"rewards/format_reward": 1.0,
"step": 339
},
{
"completion_length": 73.2890625,
"epoch": 4.096385542168675,
"grad_norm": 4.489032904646898,
"kl": 0.104736328125,
"learning_rate": 3.172690763052209e-07,
"loss": 0.0042,
"reward": 2.1710336208343506,
"reward_std": 0.17718148604035378,
"rewards/accuracy_reward": 1.1710334420204163,
"rewards/format_reward": 1.0,
"step": 340
},
{
"completion_length": 74.3671875,
"epoch": 4.108433734939759,
"grad_norm": 4.230949730619595,
"kl": 0.139892578125,
"learning_rate": 3.152610441767068e-07,
"loss": 0.0056,
"reward": 2.084486246109009,
"reward_std": 0.2170683741569519,
"rewards/accuracy_reward": 1.0922988057136536,
"rewards/format_reward": 0.9921875,
"step": 341
},
{
"completion_length": 65.5625,
"epoch": 4.120481927710843,
"grad_norm": 5.461293103432774,
"kl": 0.1044921875,
"learning_rate": 3.132530120481928e-07,
"loss": 0.0042,
"reward": 2.381394147872925,
"reward_std": 0.193039670586586,
"rewards/accuracy_reward": 1.38139408826828,
"rewards/format_reward": 1.0,
"step": 342
},
{
"completion_length": 66.15625,
"epoch": 4.132530120481928,
"grad_norm": 4.070866693962467,
"kl": 0.111572265625,
"learning_rate": 3.112449799196787e-07,
"loss": 0.0045,
"reward": 2.357278347015381,
"reward_std": 0.15215902030467987,
"rewards/accuracy_reward": 1.3729035258293152,
"rewards/format_reward": 0.984375,
"step": 343
},
{
"completion_length": 69.1328125,
"epoch": 4.144578313253012,
"grad_norm": 4.335873726549927,
"kl": 0.123046875,
"learning_rate": 3.0923694779116464e-07,
"loss": 0.0049,
"reward": 2.282222032546997,
"reward_std": 0.25280918926000595,
"rewards/accuracy_reward": 1.2978470921516418,
"rewards/format_reward": 0.984375,
"step": 344
},
{
"completion_length": 73.6015625,
"epoch": 4.156626506024097,
"grad_norm": 4.412489990442917,
"kl": 0.09765625,
"learning_rate": 3.0722891566265056e-07,
"loss": 0.0039,
"reward": 2.421238660812378,
"reward_std": 0.21779820322990417,
"rewards/accuracy_reward": 1.4290512800216675,
"rewards/format_reward": 0.9921875,
"step": 345
},
{
"completion_length": 67.3984375,
"epoch": 4.168674698795181,
"grad_norm": 3.7050619604015775,
"kl": 0.111083984375,
"learning_rate": 3.0522088353413654e-07,
"loss": 0.0044,
"reward": 2.4159966707229614,
"reward_std": 0.17116259038448334,
"rewards/accuracy_reward": 1.4159966707229614,
"rewards/format_reward": 1.0,
"step": 346
},
{
"completion_length": 68.7109375,
"epoch": 4.180722891566265,
"grad_norm": 4.638840034522594,
"kl": 0.119873046875,
"learning_rate": 3.0321285140562247e-07,
"loss": 0.0048,
"reward": 2.430918335914612,
"reward_std": 0.23829656839370728,
"rewards/accuracy_reward": 1.4309183359146118,
"rewards/format_reward": 1.0,
"step": 347
},
{
"completion_length": 68.203125,
"epoch": 4.192771084337349,
"grad_norm": 7.531973472034052,
"kl": 0.124267578125,
"learning_rate": 3.0120481927710845e-07,
"loss": 0.005,
"reward": 2.2654261589050293,
"reward_std": 0.214869923889637,
"rewards/accuracy_reward": 1.2966760993003845,
"rewards/format_reward": 0.96875,
"step": 348
},
{
"completion_length": 66.3046875,
"epoch": 4.204819277108434,
"grad_norm": 6.290139006407989,
"kl": 0.15673828125,
"learning_rate": 2.991967871485944e-07,
"loss": 0.0063,
"reward": 2.440833330154419,
"reward_std": 0.20570393651723862,
"rewards/accuracy_reward": 1.4642709493637085,
"rewards/format_reward": 0.9765625,
"step": 349
},
{
"completion_length": 68.5078125,
"epoch": 4.216867469879518,
"grad_norm": 3.870085506410607,
"kl": 0.11376953125,
"learning_rate": 2.971887550200803e-07,
"loss": 0.0046,
"reward": 2.4419082403182983,
"reward_std": 0.1332126259803772,
"rewards/accuracy_reward": 1.441908359527588,
"rewards/format_reward": 1.0,
"step": 350
},
{
"completion_length": 67.7109375,
"epoch": 4.228915662650603,
"grad_norm": 5.222390077968289,
"kl": 0.12548828125,
"learning_rate": 2.9518072289156623e-07,
"loss": 0.005,
"reward": 2.354392647743225,
"reward_std": 0.250136561691761,
"rewards/accuracy_reward": 1.3700175285339355,
"rewards/format_reward": 0.984375,
"step": 351
},
{
"completion_length": 63.75,
"epoch": 4.240963855421687,
"grad_norm": 5.7394258697520835,
"kl": 0.13671875,
"learning_rate": 2.9317269076305216e-07,
"loss": 0.0055,
"reward": 2.1846532821655273,
"reward_std": 0.27685467153787613,
"rewards/accuracy_reward": 1.2080907225608826,
"rewards/format_reward": 0.9765625,
"step": 352
},
{
"completion_length": 68.734375,
"epoch": 4.253012048192771,
"grad_norm": 3.522967170920438,
"kl": 0.10400390625,
"learning_rate": 2.9116465863453814e-07,
"loss": 0.0041,
"reward": 2.315014600753784,
"reward_std": 0.13816260546445847,
"rewards/accuracy_reward": 1.3150146007537842,
"rewards/format_reward": 1.0,
"step": 353
},
{
"completion_length": 72.8125,
"epoch": 4.265060240963855,
"grad_norm": 3.727859373676823,
"kl": 0.12939453125,
"learning_rate": 2.8915662650602407e-07,
"loss": 0.0052,
"reward": 2.206972360610962,
"reward_std": 0.23467965424060822,
"rewards/accuracy_reward": 1.2069722414016724,
"rewards/format_reward": 1.0,
"step": 354
},
{
"completion_length": 70.3359375,
"epoch": 4.27710843373494,
"grad_norm": 3.380662774166939,
"kl": 0.09716796875,
"learning_rate": 2.8714859437751005e-07,
"loss": 0.0039,
"reward": 2.1916306018829346,
"reward_std": 0.23339906334877014,
"rewards/accuracy_reward": 1.2072556018829346,
"rewards/format_reward": 0.984375,
"step": 355
},
{
"completion_length": 72.4375,
"epoch": 4.289156626506024,
"grad_norm": 3.5703829288777764,
"kl": 0.11376953125,
"learning_rate": 2.85140562248996e-07,
"loss": 0.0046,
"reward": 2.142443895339966,
"reward_std": 0.2050827294588089,
"rewards/accuracy_reward": 1.1580689549446106,
"rewards/format_reward": 0.984375,
"step": 356
},
{
"completion_length": 66.9921875,
"epoch": 4.301204819277109,
"grad_norm": 3.6787951883313275,
"kl": 0.119873046875,
"learning_rate": 2.8313253012048195e-07,
"loss": 0.0048,
"reward": 2.6013587713241577,
"reward_std": 0.17792491614818573,
"rewards/accuracy_reward": 1.6013588309288025,
"rewards/format_reward": 1.0,
"step": 357
},
{
"completion_length": 67.1875,
"epoch": 4.313253012048193,
"grad_norm": 7.9299540096420476,
"kl": 0.111328125,
"learning_rate": 2.811244979919679e-07,
"loss": 0.0044,
"reward": 2.2114800214767456,
"reward_std": 0.2541910707950592,
"rewards/accuracy_reward": 1.2271050810813904,
"rewards/format_reward": 0.984375,
"step": 358
},
{
"completion_length": 69.1953125,
"epoch": 4.325301204819277,
"grad_norm": 3.7315177619787687,
"kl": 0.10400390625,
"learning_rate": 2.7911646586345376e-07,
"loss": 0.0042,
"reward": 2.2850147485733032,
"reward_std": 0.24116653203964233,
"rewards/accuracy_reward": 1.3084524869918823,
"rewards/format_reward": 0.9765625,
"step": 359
},
{
"completion_length": 76.6640625,
"epoch": 4.337349397590361,
"grad_norm": 3.8031600707561886,
"kl": 0.08984375,
"learning_rate": 2.7710843373493974e-07,
"loss": 0.0036,
"reward": 2.372725009918213,
"reward_std": 0.23598377406597137,
"rewards/accuracy_reward": 1.380537509918213,
"rewards/format_reward": 0.9921875,
"step": 360
},
{
"completion_length": 72.6015625,
"epoch": 4.349397590361446,
"grad_norm": 6.29903230301134,
"kl": 0.10205078125,
"learning_rate": 2.7510040160642566e-07,
"loss": 0.0041,
"reward": 2.3671088218688965,
"reward_std": 0.21375955641269684,
"rewards/accuracy_reward": 1.3749213814735413,
"rewards/format_reward": 0.9921875,
"step": 361
},
{
"completion_length": 74.546875,
"epoch": 4.36144578313253,
"grad_norm": 4.5097271327174555,
"kl": 0.100341796875,
"learning_rate": 2.7309236947791164e-07,
"loss": 0.004,
"reward": 2.338581085205078,
"reward_std": 0.21793486177921295,
"rewards/accuracy_reward": 1.3463934063911438,
"rewards/format_reward": 0.9921875,
"step": 362
},
{
"completion_length": 73.203125,
"epoch": 4.373493975903615,
"grad_norm": 7.563928087147195,
"kl": 0.093505859375,
"learning_rate": 2.7108433734939757e-07,
"loss": 0.0037,
"reward": 2.4811813831329346,
"reward_std": 0.1661686971783638,
"rewards/accuracy_reward": 1.4811814427375793,
"rewards/format_reward": 1.0,
"step": 363
},
{
"completion_length": 72.2109375,
"epoch": 4.385542168674699,
"grad_norm": 4.157739455544304,
"kl": 0.11767578125,
"learning_rate": 2.6907630522088355e-07,
"loss": 0.0047,
"reward": 2.227518320083618,
"reward_std": 0.2459297701716423,
"rewards/accuracy_reward": 1.235330879688263,
"rewards/format_reward": 0.9921875,
"step": 364
},
{
"completion_length": 73.125,
"epoch": 4.397590361445783,
"grad_norm": 3.957643739786318,
"kl": 0.130126953125,
"learning_rate": 2.670682730923695e-07,
"loss": 0.0052,
"reward": 2.398737668991089,
"reward_std": 0.2508920058608055,
"rewards/accuracy_reward": 1.406550109386444,
"rewards/format_reward": 0.9921875,
"step": 365
},
{
"completion_length": 80.6484375,
"epoch": 4.409638554216867,
"grad_norm": 8.267939908268028,
"kl": 0.126220703125,
"learning_rate": 2.6506024096385546e-07,
"loss": 0.005,
"reward": 2.1884970664978027,
"reward_std": 0.32723745703697205,
"rewards/accuracy_reward": 1.2119346857070923,
"rewards/format_reward": 0.9765625,
"step": 366
},
{
"completion_length": 80.09375,
"epoch": 4.421686746987952,
"grad_norm": 3.0023836541953988,
"kl": 0.089111328125,
"learning_rate": 2.6305220883534133e-07,
"loss": 0.0036,
"reward": 2.4019484519958496,
"reward_std": 0.20879995077848434,
"rewards/accuracy_reward": 1.4019483923912048,
"rewards/format_reward": 1.0,
"step": 367
},
{
"completion_length": 76.890625,
"epoch": 4.433734939759036,
"grad_norm": 3.8760535577901916,
"kl": 0.110107421875,
"learning_rate": 2.610441767068273e-07,
"loss": 0.0044,
"reward": 2.217389702796936,
"reward_std": 0.20581622421741486,
"rewards/accuracy_reward": 1.225202202796936,
"rewards/format_reward": 0.9921875,
"step": 368
},
{
"completion_length": 70.046875,
"epoch": 4.445783132530121,
"grad_norm": 4.189426211226252,
"kl": 0.09912109375,
"learning_rate": 2.5903614457831324e-07,
"loss": 0.004,
"reward": 2.3884357213974,
"reward_std": 0.23216703534126282,
"rewards/accuracy_reward": 1.4118732213974,
"rewards/format_reward": 0.9765625,
"step": 369
},
{
"completion_length": 75.3125,
"epoch": 4.457831325301205,
"grad_norm": 3.5709834038432886,
"kl": 0.112060546875,
"learning_rate": 2.5702811244979916e-07,
"loss": 0.0045,
"reward": 2.4395360946655273,
"reward_std": 0.25345855951309204,
"rewards/accuracy_reward": 1.4551611542701721,
"rewards/format_reward": 0.984375,
"step": 370
},
{
"completion_length": 76.03125,
"epoch": 4.469879518072289,
"grad_norm": 3.8012985013892897,
"kl": 0.11962890625,
"learning_rate": 2.5502008032128514e-07,
"loss": 0.0048,
"reward": 2.2614444494247437,
"reward_std": 0.25984859466552734,
"rewards/accuracy_reward": 1.2692569494247437,
"rewards/format_reward": 0.9921875,
"step": 371
},
{
"completion_length": 72.34375,
"epoch": 4.481927710843373,
"grad_norm": 3.81905493683615,
"kl": 0.118408203125,
"learning_rate": 2.5301204819277107e-07,
"loss": 0.0047,
"reward": 2.24534273147583,
"reward_std": 0.2783522978425026,
"rewards/accuracy_reward": 1.25315523147583,
"rewards/format_reward": 0.9921875,
"step": 372
},
{
"completion_length": 73.625,
"epoch": 4.493975903614458,
"grad_norm": 5.859434170398068,
"kl": 0.129638671875,
"learning_rate": 2.5100401606425705e-07,
"loss": 0.0052,
"reward": 2.242166519165039,
"reward_std": 0.19818732887506485,
"rewards/accuracy_reward": 1.2421664595603943,
"rewards/format_reward": 1.0,
"step": 373
},
{
"completion_length": 70.7734375,
"epoch": 4.506024096385542,
"grad_norm": 4.577359942879205,
"kl": 0.113037109375,
"learning_rate": 2.489959839357429e-07,
"loss": 0.0045,
"reward": 2.40807843208313,
"reward_std": 0.16506175324320793,
"rewards/accuracy_reward": 1.408078372478485,
"rewards/format_reward": 1.0,
"step": 374
},
{
"completion_length": 71.6484375,
"epoch": 4.518072289156627,
"grad_norm": 3.6969886550918627,
"kl": 0.0947265625,
"learning_rate": 2.469879518072289e-07,
"loss": 0.0038,
"reward": 2.4090828895568848,
"reward_std": 0.17872843891382217,
"rewards/accuracy_reward": 1.4090829491615295,
"rewards/format_reward": 1.0,
"step": 375
},
{
"completion_length": 75.640625,
"epoch": 4.530120481927711,
"grad_norm": 3.182069910394249,
"kl": 0.112548828125,
"learning_rate": 2.4497991967871483e-07,
"loss": 0.0045,
"reward": 2.429325222969055,
"reward_std": 0.18355486541986465,
"rewards/accuracy_reward": 1.4371376037597656,
"rewards/format_reward": 0.9921875,
"step": 376
},
{
"completion_length": 76.8515625,
"epoch": 4.542168674698795,
"grad_norm": 4.3761923522139625,
"kl": 0.103515625,
"learning_rate": 2.429718875502008e-07,
"loss": 0.0041,
"reward": 2.215627670288086,
"reward_std": 0.29024538397789,
"rewards/accuracy_reward": 1.2234401106834412,
"rewards/format_reward": 0.9921875,
"step": 377
},
{
"completion_length": 72.640625,
"epoch": 4.554216867469879,
"grad_norm": 5.739152465768093,
"kl": 0.096923828125,
"learning_rate": 2.4096385542168674e-07,
"loss": 0.0039,
"reward": 2.3864386081695557,
"reward_std": 0.14991050213575363,
"rewards/accuracy_reward": 1.3864384889602661,
"rewards/format_reward": 1.0,
"step": 378
},
{
"completion_length": 73.7890625,
"epoch": 4.566265060240964,
"grad_norm": 4.330609617515541,
"kl": 0.105712890625,
"learning_rate": 2.3895582329317267e-07,
"loss": 0.0042,
"reward": 2.2676793336868286,
"reward_std": 0.1841476932168007,
"rewards/accuracy_reward": 1.2754917740821838,
"rewards/format_reward": 0.9921875,
"step": 379
},
{
"completion_length": 69.5859375,
"epoch": 4.578313253012048,
"grad_norm": 16.70825245009543,
"kl": 0.103515625,
"learning_rate": 2.3694779116465862e-07,
"loss": 0.0041,
"reward": 2.3687047958374023,
"reward_std": 0.23368250578641891,
"rewards/accuracy_reward": 1.3765172958374023,
"rewards/format_reward": 0.9921875,
"step": 380
},
{
"completion_length": 68.5703125,
"epoch": 4.590361445783133,
"grad_norm": 4.946973705468274,
"kl": 0.11865234375,
"learning_rate": 2.3493975903614457e-07,
"loss": 0.0047,
"reward": 2.409714102745056,
"reward_std": 0.17494437843561172,
"rewards/accuracy_reward": 1.4175265431404114,
"rewards/format_reward": 0.9921875,
"step": 381
},
{
"completion_length": 69.09375,
"epoch": 4.602409638554217,
"grad_norm": 3.4407209788639155,
"kl": 0.108154296875,
"learning_rate": 2.3293172690763053e-07,
"loss": 0.0043,
"reward": 2.3722596168518066,
"reward_std": 0.2456066906452179,
"rewards/accuracy_reward": 1.3722596764564514,
"rewards/format_reward": 1.0,
"step": 382
},
{
"completion_length": 73.40625,
"epoch": 4.614457831325301,
"grad_norm": 6.785057754949663,
"kl": 0.093017578125,
"learning_rate": 2.3092369477911648e-07,
"loss": 0.0037,
"reward": 2.390730619430542,
"reward_std": 0.13034258037805557,
"rewards/accuracy_reward": 1.390730619430542,
"rewards/format_reward": 1.0,
"step": 383
},
{
"completion_length": 69.578125,
"epoch": 4.626506024096385,
"grad_norm": 4.146766679362004,
"kl": 0.110107421875,
"learning_rate": 2.2891566265060238e-07,
"loss": 0.0044,
"reward": 2.457837224006653,
"reward_std": 0.19646844267845154,
"rewards/accuracy_reward": 1.465649664402008,
"rewards/format_reward": 0.9921875,
"step": 384
},
{
"completion_length": 71.4765625,
"epoch": 4.63855421686747,
"grad_norm": 3.5134218173180884,
"kl": 0.10791015625,
"learning_rate": 2.2690763052208834e-07,
"loss": 0.0043,
"reward": 2.2395870685577393,
"reward_std": 0.23986083269119263,
"rewards/accuracy_reward": 1.2630245089530945,
"rewards/format_reward": 0.9765625,
"step": 385
},
{
"completion_length": 67.8984375,
"epoch": 4.650602409638554,
"grad_norm": 3.5532098801033323,
"kl": 0.112060546875,
"learning_rate": 2.248995983935743e-07,
"loss": 0.0045,
"reward": 2.155800759792328,
"reward_std": 0.26599714159965515,
"rewards/accuracy_reward": 1.1714258790016174,
"rewards/format_reward": 0.984375,
"step": 386
},
{
"completion_length": 67.921875,
"epoch": 4.662650602409639,
"grad_norm": 3.977191337497143,
"kl": 0.12353515625,
"learning_rate": 2.2289156626506022e-07,
"loss": 0.0049,
"reward": 2.1573885679244995,
"reward_std": 0.19674725830554962,
"rewards/accuracy_reward": 1.165201187133789,
"rewards/format_reward": 0.9921875,
"step": 387
},
{
"completion_length": 73.3671875,
"epoch": 4.674698795180722,
"grad_norm": 3.4384187805900894,
"kl": 0.1005859375,
"learning_rate": 2.2088353413654617e-07,
"loss": 0.004,
"reward": 2.238619089126587,
"reward_std": 0.1663391888141632,
"rewards/accuracy_reward": 1.2386190295219421,
"rewards/format_reward": 1.0,
"step": 388
},
{
"completion_length": 71.3515625,
"epoch": 4.686746987951807,
"grad_norm": 3.6715987846617737,
"kl": 0.1103515625,
"learning_rate": 2.1887550200803212e-07,
"loss": 0.0044,
"reward": 2.2813053131103516,
"reward_std": 0.20307840406894684,
"rewards/accuracy_reward": 1.2891177535057068,
"rewards/format_reward": 0.9921875,
"step": 389
},
{
"completion_length": 67.8671875,
"epoch": 4.698795180722891,
"grad_norm": 4.1990886176906566,
"kl": 0.1181640625,
"learning_rate": 2.1686746987951808e-07,
"loss": 0.0047,
"reward": 2.3316123485565186,
"reward_std": 0.18899912387132645,
"rewards/accuracy_reward": 1.339424967765808,
"rewards/format_reward": 0.9921875,
"step": 390
},
{
"completion_length": 73.5390625,
"epoch": 4.710843373493976,
"grad_norm": 4.5848307121684035,
"kl": 0.11767578125,
"learning_rate": 2.14859437751004e-07,
"loss": 0.0047,
"reward": 2.3556346893310547,
"reward_std": 0.17518161982297897,
"rewards/accuracy_reward": 1.3634473085403442,
"rewards/format_reward": 0.9921875,
"step": 391
},
{
"completion_length": 73.3828125,
"epoch": 4.72289156626506,
"grad_norm": 4.308895887462787,
"kl": 0.09716796875,
"learning_rate": 2.1285140562248996e-07,
"loss": 0.0039,
"reward": 2.3230199813842773,
"reward_std": 0.2215501293540001,
"rewards/accuracy_reward": 1.3230200409889221,
"rewards/format_reward": 1.0,
"step": 392
},
{
"completion_length": 71.625,
"epoch": 4.734939759036145,
"grad_norm": 3.8869195849917335,
"kl": 0.117919921875,
"learning_rate": 2.108433734939759e-07,
"loss": 0.0047,
"reward": 2.311624765396118,
"reward_std": 0.233637273311615,
"rewards/accuracy_reward": 1.3116250038146973,
"rewards/format_reward": 1.0,
"step": 393
},
{
"completion_length": 67.828125,
"epoch": 4.746987951807229,
"grad_norm": 4.950759054297939,
"kl": 0.10888671875,
"learning_rate": 2.0883534136546184e-07,
"loss": 0.0044,
"reward": 2.379747152328491,
"reward_std": 0.19298578798770905,
"rewards/accuracy_reward": 1.3797469735145569,
"rewards/format_reward": 1.0,
"step": 394
},
{
"completion_length": 72.2578125,
"epoch": 4.759036144578313,
"grad_norm": 45.47765651174386,
"kl": 0.126708984375,
"learning_rate": 2.0682730923694776e-07,
"loss": 0.0051,
"reward": 2.078563928604126,
"reward_std": 0.253988578915596,
"rewards/accuracy_reward": 1.0941888689994812,
"rewards/format_reward": 0.984375,
"step": 395
},
{
"completion_length": 71.6484375,
"epoch": 4.771084337349397,
"grad_norm": 6.044646695827286,
"kl": 0.13916015625,
"learning_rate": 2.0481927710843372e-07,
"loss": 0.0056,
"reward": 2.485829472541809,
"reward_std": 0.180104598402977,
"rewards/accuracy_reward": 1.4858292937278748,
"rewards/format_reward": 1.0,
"step": 396
},
{
"completion_length": 65.09375,
"epoch": 4.783132530120482,
"grad_norm": 4.360820446081869,
"kl": 0.1416015625,
"learning_rate": 2.0281124497991967e-07,
"loss": 0.0057,
"reward": 2.1638635396957397,
"reward_std": 0.31551285088062286,
"rewards/accuracy_reward": 1.1873010993003845,
"rewards/format_reward": 0.9765625,
"step": 397
},
{
"completion_length": 70.6328125,
"epoch": 4.795180722891566,
"grad_norm": 5.234619949658262,
"kl": 0.115966796875,
"learning_rate": 2.0080321285140563e-07,
"loss": 0.0046,
"reward": 2.424190402030945,
"reward_std": 0.23157334327697754,
"rewards/accuracy_reward": 1.4241904616355896,
"rewards/format_reward": 1.0,
"step": 398
},
{
"completion_length": 70.4375,
"epoch": 4.807228915662651,
"grad_norm": 5.2543384630783265,
"kl": 0.12060546875,
"learning_rate": 1.9879518072289155e-07,
"loss": 0.0048,
"reward": 2.3333520889282227,
"reward_std": 0.2145429253578186,
"rewards/accuracy_reward": 1.3411647081375122,
"rewards/format_reward": 0.9921875,
"step": 399
},
{
"completion_length": 65.421875,
"epoch": 4.8192771084337345,
"grad_norm": 6.050688926597152,
"kl": 0.125732421875,
"learning_rate": 1.967871485943775e-07,
"loss": 0.005,
"reward": 2.412783145904541,
"reward_std": 0.2059781178832054,
"rewards/accuracy_reward": 1.420595645904541,
"rewards/format_reward": 0.9921875,
"step": 400
},
{
"completion_length": 63.5546875,
"epoch": 4.831325301204819,
"grad_norm": 4.14350718873446,
"kl": 0.143798828125,
"learning_rate": 1.9477911646586346e-07,
"loss": 0.0057,
"reward": 2.3667309284210205,
"reward_std": 0.1764308363199234,
"rewards/accuracy_reward": 1.3745434284210205,
"rewards/format_reward": 0.9921875,
"step": 401
},
{
"completion_length": 71.8671875,
"epoch": 4.843373493975903,
"grad_norm": 4.134424932683493,
"kl": 0.126953125,
"learning_rate": 1.9277108433734939e-07,
"loss": 0.0051,
"reward": 2.2129541635513306,
"reward_std": 0.1565767452120781,
"rewards/accuracy_reward": 1.2129541635513306,
"rewards/format_reward": 1.0,
"step": 402
},
{
"completion_length": 64.0390625,
"epoch": 4.855421686746988,
"grad_norm": 4.135875391105592,
"kl": 0.166015625,
"learning_rate": 1.9076305220883534e-07,
"loss": 0.0066,
"reward": 2.3259581327438354,
"reward_std": 0.2349315583705902,
"rewards/accuracy_reward": 1.3259583115577698,
"rewards/format_reward": 1.0,
"step": 403
},
{
"completion_length": 66.515625,
"epoch": 4.867469879518072,
"grad_norm": 4.276605246406482,
"kl": 0.138916015625,
"learning_rate": 1.8875502008032127e-07,
"loss": 0.0056,
"reward": 2.306966781616211,
"reward_std": 0.2081274688243866,
"rewards/accuracy_reward": 1.3069666624069214,
"rewards/format_reward": 1.0,
"step": 404
},
{
"completion_length": 62.28125,
"epoch": 4.879518072289157,
"grad_norm": 4.594134632277065,
"kl": 0.1826171875,
"learning_rate": 1.8674698795180722e-07,
"loss": 0.0073,
"reward": 2.126552700996399,
"reward_std": 0.255823478102684,
"rewards/accuracy_reward": 1.1421778202056885,
"rewards/format_reward": 0.984375,
"step": 405
},
{
"completion_length": 62.3671875,
"epoch": 4.891566265060241,
"grad_norm": 3.568434088807843,
"kl": 0.14013671875,
"learning_rate": 1.8473895582329315e-07,
"loss": 0.0056,
"reward": 2.417848587036133,
"reward_std": 0.22225632518529892,
"rewards/accuracy_reward": 1.4334735870361328,
"rewards/format_reward": 0.984375,
"step": 406
},
{
"completion_length": 66.5078125,
"epoch": 4.903614457831325,
"grad_norm": 4.123527789276523,
"kl": 0.10986328125,
"learning_rate": 1.827309236947791e-07,
"loss": 0.0044,
"reward": 2.294624924659729,
"reward_std": 0.19924252480268478,
"rewards/accuracy_reward": 1.3024373650550842,
"rewards/format_reward": 0.9921875,
"step": 407
},
{
"completion_length": 66.390625,
"epoch": 4.9156626506024095,
"grad_norm": 3.62978164804241,
"kl": 0.12890625,
"learning_rate": 1.8072289156626505e-07,
"loss": 0.0051,
"reward": 2.543404698371887,
"reward_std": 0.1362360306084156,
"rewards/accuracy_reward": 1.5434046983718872,
"rewards/format_reward": 1.0,
"step": 408
},
{
"completion_length": 63.9765625,
"epoch": 4.927710843373494,
"grad_norm": 4.35384844886202,
"kl": 0.12890625,
"learning_rate": 1.78714859437751e-07,
"loss": 0.0052,
"reward": 2.418124198913574,
"reward_std": 0.22236012667417526,
"rewards/accuracy_reward": 1.4337490797042847,
"rewards/format_reward": 0.984375,
"step": 409
},
{
"completion_length": 68.90625,
"epoch": 4.9397590361445785,
"grad_norm": 5.014972518639089,
"kl": 0.1103515625,
"learning_rate": 1.7670682730923694e-07,
"loss": 0.0044,
"reward": 2.4006751775741577,
"reward_std": 0.16714774072170258,
"rewards/accuracy_reward": 1.4006752967834473,
"rewards/format_reward": 1.0,
"step": 410
},
{
"completion_length": 69.59375,
"epoch": 4.951807228915663,
"grad_norm": 7.696032017895469,
"kl": 0.13916015625,
"learning_rate": 1.746987951807229e-07,
"loss": 0.0056,
"reward": 2.395194172859192,
"reward_std": 0.16039493680000305,
"rewards/accuracy_reward": 1.3951941132545471,
"rewards/format_reward": 1.0,
"step": 411
},
{
"completion_length": 70.125,
"epoch": 4.9638554216867465,
"grad_norm": 4.628350833888434,
"kl": 0.149169921875,
"learning_rate": 1.7269076305220884e-07,
"loss": 0.006,
"reward": 2.1348607540130615,
"reward_std": 0.1709538996219635,
"rewards/accuracy_reward": 1.1348606944084167,
"rewards/format_reward": 1.0,
"step": 412
},
{
"completion_length": 66.2109375,
"epoch": 4.975903614457831,
"grad_norm": 3.188607704812383,
"kl": 0.12646484375,
"learning_rate": 1.706827309236948e-07,
"loss": 0.0051,
"reward": 2.302504062652588,
"reward_std": 0.2623682767152786,
"rewards/accuracy_reward": 1.3181291222572327,
"rewards/format_reward": 0.984375,
"step": 413
},
{
"completion_length": 64.171875,
"epoch": 4.9879518072289155,
"grad_norm": 3.9665667179390773,
"kl": 0.128662109375,
"learning_rate": 1.686746987951807e-07,
"loss": 0.0052,
"reward": 2.4097338914871216,
"reward_std": 0.17293449118733406,
"rewards/accuracy_reward": 1.4097338318824768,
"rewards/format_reward": 1.0,
"step": 414
},
{
"completion_length": 77.33333587646484,
"epoch": 5.0,
"grad_norm": 3.313170759959086,
"kl": 0.1083984375,
"learning_rate": 1.6666666666666665e-07,
"loss": 0.004,
"reward": 2.2759520411491394,
"reward_std": 0.1403224766254425,
"rewards/accuracy_reward": 1.2759520411491394,
"rewards/format_reward": 1.0,
"step": 415
},
{
"completion_length": 66.3203125,
"epoch": 5.0120481927710845,
"grad_norm": 4.277881132595083,
"kl": 0.14306640625,
"learning_rate": 1.646586345381526e-07,
"loss": 0.0057,
"reward": 2.373741865158081,
"reward_std": 0.20744601637125015,
"rewards/accuracy_reward": 1.3815542459487915,
"rewards/format_reward": 0.9921875,
"step": 416
},
{
"completion_length": 66.53125,
"epoch": 5.024096385542169,
"grad_norm": 3.9929439696450575,
"kl": 0.12939453125,
"learning_rate": 1.6265060240963853e-07,
"loss": 0.0052,
"reward": 2.35166335105896,
"reward_std": 0.2503097951412201,
"rewards/accuracy_reward": 1.35166335105896,
"rewards/format_reward": 1.0,
"step": 417
},
{
"completion_length": 68.625,
"epoch": 5.036144578313253,
"grad_norm": 4.023924792103433,
"kl": 0.114013671875,
"learning_rate": 1.6064257028112448e-07,
"loss": 0.0046,
"reward": 2.2476612329483032,
"reward_std": 0.185993991792202,
"rewards/accuracy_reward": 1.2554737329483032,
"rewards/format_reward": 0.9921875,
"step": 418
},
{
"completion_length": 65.7421875,
"epoch": 5.048192771084337,
"grad_norm": 3.5711137415239618,
"kl": 0.134033203125,
"learning_rate": 1.5863453815261044e-07,
"loss": 0.0054,
"reward": 2.2856324911117554,
"reward_std": 0.14102690666913986,
"rewards/accuracy_reward": 1.2856324911117554,
"rewards/format_reward": 1.0,
"step": 419
},
{
"completion_length": 65.1328125,
"epoch": 5.0602409638554215,
"grad_norm": 5.8881280705003505,
"kl": 0.1259765625,
"learning_rate": 1.566265060240964e-07,
"loss": 0.005,
"reward": 2.474275588989258,
"reward_std": 0.2030300498008728,
"rewards/accuracy_reward": 1.474275529384613,
"rewards/format_reward": 1.0,
"step": 420
},
{
"completion_length": 59.453125,
"epoch": 5.072289156626506,
"grad_norm": 17.487945694806488,
"kl": 0.1279296875,
"learning_rate": 1.5461847389558232e-07,
"loss": 0.0051,
"reward": 2.468233823776245,
"reward_std": 0.17333931475877762,
"rewards/accuracy_reward": 1.4682338237762451,
"rewards/format_reward": 1.0,
"step": 421
},
{
"completion_length": 67.7421875,
"epoch": 5.0843373493975905,
"grad_norm": 4.5642738703913865,
"kl": 0.12646484375,
"learning_rate": 1.5261044176706827e-07,
"loss": 0.0051,
"reward": 2.39510977268219,
"reward_std": 0.1837218478322029,
"rewards/accuracy_reward": 1.3951098918914795,
"rewards/format_reward": 1.0,
"step": 422
},
{
"completion_length": 64.515625,
"epoch": 5.096385542168675,
"grad_norm": 7.684070732359071,
"kl": 0.139892578125,
"learning_rate": 1.5060240963855423e-07,
"loss": 0.0056,
"reward": 2.16294264793396,
"reward_std": 0.14895135164260864,
"rewards/accuracy_reward": 1.1707550883293152,
"rewards/format_reward": 0.9921875,
"step": 423
},
{
"completion_length": 64.46875,
"epoch": 5.108433734939759,
"grad_norm": 3.930344733874979,
"kl": 0.11669921875,
"learning_rate": 1.4859437751004015e-07,
"loss": 0.0047,
"reward": 2.3980486392974854,
"reward_std": 0.15896277129650116,
"rewards/accuracy_reward": 1.3980485796928406,
"rewards/format_reward": 1.0,
"step": 424
},
{
"completion_length": 68.875,
"epoch": 5.120481927710843,
"grad_norm": 6.912033255857147,
"kl": 0.118896484375,
"learning_rate": 1.4658634538152608e-07,
"loss": 0.0048,
"reward": 2.4401201009750366,
"reward_std": 0.18969366699457169,
"rewards/accuracy_reward": 1.440119981765747,
"rewards/format_reward": 1.0,
"step": 425
},
{
"completion_length": 65.609375,
"epoch": 5.132530120481928,
"grad_norm": 3.6477005267341163,
"kl": 0.1708984375,
"learning_rate": 1.4457831325301203e-07,
"loss": 0.0068,
"reward": 2.300011992454529,
"reward_std": 0.2104162722826004,
"rewards/accuracy_reward": 1.300011932849884,
"rewards/format_reward": 1.0,
"step": 426
},
{
"completion_length": 65.0859375,
"epoch": 5.144578313253012,
"grad_norm": 5.390081007205584,
"kl": 0.12548828125,
"learning_rate": 1.42570281124498e-07,
"loss": 0.005,
"reward": 2.407547354698181,
"reward_std": 0.19479839503765106,
"rewards/accuracy_reward": 1.4075472354888916,
"rewards/format_reward": 1.0,
"step": 427
},
{
"completion_length": 65.8046875,
"epoch": 5.156626506024097,
"grad_norm": 5.842696773596783,
"kl": 0.12255859375,
"learning_rate": 1.4056224899598394e-07,
"loss": 0.0049,
"reward": 2.2872836589813232,
"reward_std": 0.2501709461212158,
"rewards/accuracy_reward": 1.2950963973999023,
"rewards/format_reward": 0.9921875,
"step": 428
},
{
"completion_length": 67.2890625,
"epoch": 5.168674698795181,
"grad_norm": 3.9373211288360612,
"kl": 0.134765625,
"learning_rate": 1.3855421686746987e-07,
"loss": 0.0054,
"reward": 2.4114162921905518,
"reward_std": 0.22173649817705154,
"rewards/accuracy_reward": 1.419228732585907,
"rewards/format_reward": 0.9921875,
"step": 429
},
{
"completion_length": 65.7265625,
"epoch": 5.180722891566265,
"grad_norm": 5.989728831260378,
"kl": 0.20263671875,
"learning_rate": 1.3654618473895582e-07,
"loss": 0.0081,
"reward": 2.349661111831665,
"reward_std": 0.24485966563224792,
"rewards/accuracy_reward": 1.3496609926223755,
"rewards/format_reward": 1.0,
"step": 430
},
{
"completion_length": 71.0390625,
"epoch": 5.192771084337349,
"grad_norm": 4.9722233041190425,
"kl": 0.11083984375,
"learning_rate": 1.3453815261044177e-07,
"loss": 0.0044,
"reward": 2.423168659210205,
"reward_std": 0.16536322236061096,
"rewards/accuracy_reward": 1.4231685996055603,
"rewards/format_reward": 1.0,
"step": 431
},
{
"completion_length": 66.234375,
"epoch": 5.204819277108434,
"grad_norm": 3.5058259130400162,
"kl": 0.1376953125,
"learning_rate": 1.3253012048192773e-07,
"loss": 0.0055,
"reward": 2.2352651357650757,
"reward_std": 0.18688317388296127,
"rewards/accuracy_reward": 1.2352651357650757,
"rewards/format_reward": 1.0,
"step": 432
},
{
"completion_length": 72.8203125,
"epoch": 5.216867469879518,
"grad_norm": 3.8748331360003485,
"kl": 0.130859375,
"learning_rate": 1.3052208835341366e-07,
"loss": 0.0052,
"reward": 2.3151748180389404,
"reward_std": 0.21110112965106964,
"rewards/accuracy_reward": 1.3229871988296509,
"rewards/format_reward": 0.9921875,
"step": 433
},
{
"completion_length": 68.8671875,
"epoch": 5.228915662650603,
"grad_norm": 3.985332448415374,
"kl": 0.1220703125,
"learning_rate": 1.2851405622489958e-07,
"loss": 0.0049,
"reward": 2.26615047454834,
"reward_std": 0.20259422063827515,
"rewards/accuracy_reward": 1.2739630937576294,
"rewards/format_reward": 0.9921875,
"step": 434
},
{
"completion_length": 64.0234375,
"epoch": 5.240963855421687,
"grad_norm": 4.209088113123041,
"kl": 0.119873046875,
"learning_rate": 1.2650602409638554e-07,
"loss": 0.0048,
"reward": 2.345677137374878,
"reward_std": 0.16655350476503372,
"rewards/accuracy_reward": 1.345677137374878,
"rewards/format_reward": 1.0,
"step": 435
},
{
"completion_length": 72.2109375,
"epoch": 5.253012048192771,
"grad_norm": 3.7180924645581994,
"kl": 0.13427734375,
"learning_rate": 1.2449799196787146e-07,
"loss": 0.0054,
"reward": 2.163213849067688,
"reward_std": 0.3149610310792923,
"rewards/accuracy_reward": 1.1866515278816223,
"rewards/format_reward": 0.9765625,
"step": 436
},
{
"completion_length": 65.328125,
"epoch": 5.265060240963855,
"grad_norm": 3.8280472693841556,
"kl": 0.12744140625,
"learning_rate": 1.2248995983935742e-07,
"loss": 0.0051,
"reward": 2.3446794748306274,
"reward_std": 0.22430174052715302,
"rewards/accuracy_reward": 1.3446794152259827,
"rewards/format_reward": 1.0,
"step": 437
},
{
"completion_length": 64.65625,
"epoch": 5.27710843373494,
"grad_norm": 5.861122122648032,
"kl": 0.12060546875,
"learning_rate": 1.2048192771084337e-07,
"loss": 0.0048,
"reward": 2.379356861114502,
"reward_std": 0.1506607085466385,
"rewards/accuracy_reward": 1.3871691226959229,
"rewards/format_reward": 0.9921875,
"step": 438
},
{
"completion_length": 71.1171875,
"epoch": 5.289156626506024,
"grad_norm": 3.8119653679452092,
"kl": 0.12353515625,
"learning_rate": 1.1847389558232931e-07,
"loss": 0.0049,
"reward": 2.388357400894165,
"reward_std": 0.23687779903411865,
"rewards/accuracy_reward": 1.3961697816848755,
"rewards/format_reward": 0.9921875,
"step": 439
},
{
"completion_length": 72.3515625,
"epoch": 5.301204819277109,
"grad_norm": 3.9178115284886372,
"kl": 0.095458984375,
"learning_rate": 1.1646586345381526e-07,
"loss": 0.0038,
"reward": 2.6513583660125732,
"reward_std": 0.17830242216587067,
"rewards/accuracy_reward": 1.6513583660125732,
"rewards/format_reward": 1.0,
"step": 440
},
{
"completion_length": 68.921875,
"epoch": 5.313253012048193,
"grad_norm": 4.623442869387058,
"kl": 0.100830078125,
"learning_rate": 1.1445783132530119e-07,
"loss": 0.004,
"reward": 2.549654483795166,
"reward_std": 0.16079290956258774,
"rewards/accuracy_reward": 1.5574671030044556,
"rewards/format_reward": 0.9921875,
"step": 441
},
{
"completion_length": 71.3203125,
"epoch": 5.325301204819277,
"grad_norm": 5.278895722638805,
"kl": 0.10986328125,
"learning_rate": 1.1244979919678714e-07,
"loss": 0.0044,
"reward": 2.203883409500122,
"reward_std": 0.258064404129982,
"rewards/accuracy_reward": 1.2116957902908325,
"rewards/format_reward": 0.9921875,
"step": 442
},
{
"completion_length": 69.515625,
"epoch": 5.337349397590361,
"grad_norm": 4.142710717599773,
"kl": 0.113525390625,
"learning_rate": 1.1044176706827308e-07,
"loss": 0.0045,
"reward": 2.1769516468048096,
"reward_std": 0.275626465678215,
"rewards/accuracy_reward": 1.1769516468048096,
"rewards/format_reward": 1.0,
"step": 443
},
{
"completion_length": 68.3203125,
"epoch": 5.349397590361446,
"grad_norm": 4.180078412016221,
"kl": 0.147216796875,
"learning_rate": 1.0843373493975904e-07,
"loss": 0.0059,
"reward": 2.381720542907715,
"reward_std": 0.20287376642227173,
"rewards/accuracy_reward": 1.3817205429077148,
"rewards/format_reward": 1.0,
"step": 444
},
{
"completion_length": 69.7421875,
"epoch": 5.36144578313253,
"grad_norm": 3.7523897150785603,
"kl": 0.12939453125,
"learning_rate": 1.0642570281124498e-07,
"loss": 0.0052,
"reward": 2.3669261932373047,
"reward_std": 0.2056456208229065,
"rewards/accuracy_reward": 1.3747385740280151,
"rewards/format_reward": 0.9921875,
"step": 445
},
{
"completion_length": 67.7109375,
"epoch": 5.373493975903615,
"grad_norm": 4.924758819089559,
"kl": 0.185546875,
"learning_rate": 1.0441767068273092e-07,
"loss": 0.0074,
"reward": 2.4100332260131836,
"reward_std": 0.22913093864917755,
"rewards/accuracy_reward": 1.4178457260131836,
"rewards/format_reward": 0.9921875,
"step": 446
},
{
"completion_length": 69.1875,
"epoch": 5.385542168674699,
"grad_norm": 3.080626056952063,
"kl": 0.122314453125,
"learning_rate": 1.0240963855421686e-07,
"loss": 0.0049,
"reward": 2.3073067665100098,
"reward_std": 0.23586007952690125,
"rewards/accuracy_reward": 1.315119206905365,
"rewards/format_reward": 0.9921875,
"step": 447
},
{
"completion_length": 67.59375,
"epoch": 5.397590361445783,
"grad_norm": 3.8573400804993314,
"kl": 0.128662109375,
"learning_rate": 1.0040160642570281e-07,
"loss": 0.0051,
"reward": 2.2195699214935303,
"reward_std": 0.18059836328029633,
"rewards/accuracy_reward": 1.2195698618888855,
"rewards/format_reward": 1.0,
"step": 448
},
{
"completion_length": 65.0078125,
"epoch": 5.409638554216867,
"grad_norm": 9.729377045307634,
"kl": 0.110107421875,
"learning_rate": 9.839357429718875e-08,
"loss": 0.0044,
"reward": 2.335146427154541,
"reward_std": 0.20962534099817276,
"rewards/accuracy_reward": 1.3429590463638306,
"rewards/format_reward": 0.9921875,
"step": 449
},
{
"completion_length": 76.171875,
"epoch": 5.421686746987952,
"grad_norm": 5.139417091846479,
"kl": 0.17626953125,
"learning_rate": 9.638554216867469e-08,
"loss": 0.0071,
"reward": 2.2514326572418213,
"reward_std": 0.18450473248958588,
"rewards/accuracy_reward": 1.2592450976371765,
"rewards/format_reward": 0.9921875,
"step": 450
},
{
"completion_length": 68.046875,
"epoch": 5.433734939759036,
"grad_norm": 3.961385062957452,
"kl": 0.10693359375,
"learning_rate": 9.437751004016063e-08,
"loss": 0.0043,
"reward": 2.328533172607422,
"reward_std": 0.18290965259075165,
"rewards/accuracy_reward": 1.3285331726074219,
"rewards/format_reward": 1.0,
"step": 451
},
{
"completion_length": 68.6953125,
"epoch": 5.445783132530121,
"grad_norm": 4.887519681333338,
"kl": 0.103759765625,
"learning_rate": 9.236947791164657e-08,
"loss": 0.0042,
"reward": 2.3144426345825195,
"reward_std": 0.21034369617700577,
"rewards/accuracy_reward": 1.3144426941871643,
"rewards/format_reward": 1.0,
"step": 452
},
{
"completion_length": 68.0,
"epoch": 5.457831325301205,
"grad_norm": 3.80893967356862,
"kl": 0.127685546875,
"learning_rate": 9.036144578313253e-08,
"loss": 0.0051,
"reward": 2.4345412254333496,
"reward_std": 0.2006332352757454,
"rewards/accuracy_reward": 1.4345412254333496,
"rewards/format_reward": 1.0,
"step": 453
},
{
"completion_length": 67.046875,
"epoch": 5.469879518072289,
"grad_norm": 4.2954066473287815,
"kl": 0.12841796875,
"learning_rate": 8.835341365461847e-08,
"loss": 0.0052,
"reward": 2.353352427482605,
"reward_std": 0.22566306591033936,
"rewards/accuracy_reward": 1.353352427482605,
"rewards/format_reward": 1.0,
"step": 454
},
{
"completion_length": 64.8984375,
"epoch": 5.481927710843373,
"grad_norm": 4.546803918905019,
"kl": 0.1337890625,
"learning_rate": 8.634538152610442e-08,
"loss": 0.0054,
"reward": 2.3113902807235718,
"reward_std": 0.20004340261220932,
"rewards/accuracy_reward": 1.3192027807235718,
"rewards/format_reward": 0.9921875,
"step": 455
},
{
"completion_length": 66.1640625,
"epoch": 5.493975903614458,
"grad_norm": 3.5466190382737883,
"kl": 0.123046875,
"learning_rate": 8.433734939759035e-08,
"loss": 0.0049,
"reward": 2.3270002603530884,
"reward_std": 0.21506989747285843,
"rewards/accuracy_reward": 1.3270001411437988,
"rewards/format_reward": 1.0,
"step": 456
},
{
"completion_length": 72.3984375,
"epoch": 5.506024096385542,
"grad_norm": 5.213818604387868,
"kl": 0.1328125,
"learning_rate": 8.23293172690763e-08,
"loss": 0.0053,
"reward": 2.4117329120635986,
"reward_std": 0.21075783669948578,
"rewards/accuracy_reward": 1.411732792854309,
"rewards/format_reward": 1.0,
"step": 457
},
{
"completion_length": 63.4140625,
"epoch": 5.518072289156627,
"grad_norm": 4.087135154378612,
"kl": 0.1142578125,
"learning_rate": 8.032128514056224e-08,
"loss": 0.0046,
"reward": 2.2361518144607544,
"reward_std": 0.15534771978855133,
"rewards/accuracy_reward": 1.2361518740653992,
"rewards/format_reward": 1.0,
"step": 458
},
{
"completion_length": 66.6796875,
"epoch": 5.530120481927711,
"grad_norm": 3.8509871084036083,
"kl": 0.12255859375,
"learning_rate": 7.83132530120482e-08,
"loss": 0.0049,
"reward": 2.402904510498047,
"reward_std": 0.18761365860700607,
"rewards/accuracy_reward": 1.4029043912887573,
"rewards/format_reward": 1.0,
"step": 459
},
{
"completion_length": 67.921875,
"epoch": 5.542168674698795,
"grad_norm": 3.8868143152174714,
"kl": 0.1201171875,
"learning_rate": 7.630522088353414e-08,
"loss": 0.0048,
"reward": 2.202209234237671,
"reward_std": 0.20886321365833282,
"rewards/accuracy_reward": 1.2022093534469604,
"rewards/format_reward": 1.0,
"step": 460
},
{
"completion_length": 69.84375,
"epoch": 5.554216867469879,
"grad_norm": 9.828452094441177,
"kl": 0.138427734375,
"learning_rate": 7.429718875502008e-08,
"loss": 0.0055,
"reward": 2.255289673805237,
"reward_std": 0.3091956526041031,
"rewards/accuracy_reward": 1.2787271738052368,
"rewards/format_reward": 0.9765625,
"step": 461
},
{
"completion_length": 67.7265625,
"epoch": 5.566265060240964,
"grad_norm": 3.5884325923981777,
"kl": 0.14501953125,
"learning_rate": 7.228915662650602e-08,
"loss": 0.0058,
"reward": 2.389763116836548,
"reward_std": 0.1989041194319725,
"rewards/accuracy_reward": 1.3897631168365479,
"rewards/format_reward": 1.0,
"step": 462
},
{
"completion_length": 63.4765625,
"epoch": 5.578313253012048,
"grad_norm": 3.943165256338966,
"kl": 0.15185546875,
"learning_rate": 7.028112449799197e-08,
"loss": 0.0061,
"reward": 2.2263519763946533,
"reward_std": 0.22419632971286774,
"rewards/accuracy_reward": 1.2341644763946533,
"rewards/format_reward": 0.9921875,
"step": 463
},
{
"completion_length": 67.734375,
"epoch": 5.590361445783133,
"grad_norm": 8.892123036444877,
"kl": 0.126953125,
"learning_rate": 6.827309236947791e-08,
"loss": 0.0051,
"reward": 2.3126423358917236,
"reward_std": 0.17722339183092117,
"rewards/accuracy_reward": 1.3126422762870789,
"rewards/format_reward": 1.0,
"step": 464
},
{
"completion_length": 75.5546875,
"epoch": 5.602409638554217,
"grad_norm": 4.229071556328315,
"kl": 0.1240234375,
"learning_rate": 6.626506024096386e-08,
"loss": 0.005,
"reward": 2.2280049324035645,
"reward_std": 0.22474994510412216,
"rewards/accuracy_reward": 1.235817551612854,
"rewards/format_reward": 0.9921875,
"step": 465
},
{
"completion_length": 66.9609375,
"epoch": 5.614457831325301,
"grad_norm": 4.577684554062664,
"kl": 0.12451171875,
"learning_rate": 6.425702811244979e-08,
"loss": 0.005,
"reward": 2.2235909700393677,
"reward_std": 0.22441789507865906,
"rewards/accuracy_reward": 1.2392158508300781,
"rewards/format_reward": 0.984375,
"step": 466
},
{
"completion_length": 70.4375,
"epoch": 5.626506024096385,
"grad_norm": 4.349159327486559,
"kl": 0.112548828125,
"learning_rate": 6.224899598393573e-08,
"loss": 0.0045,
"reward": 2.3591808080673218,
"reward_std": 0.1966349333524704,
"rewards/accuracy_reward": 1.3669933080673218,
"rewards/format_reward": 0.9921875,
"step": 467
},
{
"completion_length": 69.4453125,
"epoch": 5.63855421686747,
"grad_norm": 3.0423100870405437,
"kl": 0.138671875,
"learning_rate": 6.024096385542168e-08,
"loss": 0.0055,
"reward": 2.4168301820755005,
"reward_std": 0.23313428461551666,
"rewards/accuracy_reward": 1.4246427416801453,
"rewards/format_reward": 0.9921875,
"step": 468
},
{
"completion_length": 67.9453125,
"epoch": 5.650602409638554,
"grad_norm": 4.8492295392656075,
"kl": 0.124755859375,
"learning_rate": 5.823293172690763e-08,
"loss": 0.005,
"reward": 2.3264076709747314,
"reward_std": 0.18676774948835373,
"rewards/accuracy_reward": 1.3264076709747314,
"rewards/format_reward": 1.0,
"step": 469
},
{
"completion_length": 68.3984375,
"epoch": 5.662650602409639,
"grad_norm": 3.7143887896006706,
"kl": 0.118896484375,
"learning_rate": 5.622489959839357e-08,
"loss": 0.0048,
"reward": 2.275146722793579,
"reward_std": 0.23441863059997559,
"rewards/accuracy_reward": 1.2907716631889343,
"rewards/format_reward": 0.984375,
"step": 470
},
{
"completion_length": 69.703125,
"epoch": 5.674698795180722,
"grad_norm": 6.421818895030251,
"kl": 0.105712890625,
"learning_rate": 5.421686746987952e-08,
"loss": 0.0042,
"reward": 2.3713172674179077,
"reward_std": 0.17046835273504257,
"rewards/accuracy_reward": 1.3713172674179077,
"rewards/format_reward": 1.0,
"step": 471
},
{
"completion_length": 71.7578125,
"epoch": 5.686746987951807,
"grad_norm": 3.7429303333646846,
"kl": 0.17333984375,
"learning_rate": 5.220883534136546e-08,
"loss": 0.0069,
"reward": 2.21248197555542,
"reward_std": 0.1897253841161728,
"rewards/accuracy_reward": 1.2202943563461304,
"rewards/format_reward": 0.9921875,
"step": 472
},
{
"completion_length": 66.0625,
"epoch": 5.698795180722891,
"grad_norm": 4.6125292648898375,
"kl": 0.1171875,
"learning_rate": 5.0200803212851406e-08,
"loss": 0.0047,
"reward": 2.3862085342407227,
"reward_std": 0.14106625318527222,
"rewards/accuracy_reward": 1.3940210938453674,
"rewards/format_reward": 0.9921875,
"step": 473
},
{
"completion_length": 71.4296875,
"epoch": 5.710843373493976,
"grad_norm": 4.192704287374918,
"kl": 0.108642578125,
"learning_rate": 4.8192771084337347e-08,
"loss": 0.0043,
"reward": 2.3476767539978027,
"reward_std": 0.20362288504838943,
"rewards/accuracy_reward": 1.3476767539978027,
"rewards/format_reward": 1.0,
"step": 474
},
{
"completion_length": 67.2109375,
"epoch": 5.72289156626506,
"grad_norm": 4.1447657242460645,
"kl": 0.1298828125,
"learning_rate": 4.618473895582329e-08,
"loss": 0.0052,
"reward": 2.266420602798462,
"reward_std": 0.2129717692732811,
"rewards/accuracy_reward": 1.2664207220077515,
"rewards/format_reward": 1.0,
"step": 475
},
{
"completion_length": 66.546875,
"epoch": 5.734939759036145,
"grad_norm": 3.4345215566799574,
"kl": 0.106201171875,
"learning_rate": 4.4176706827309234e-08,
"loss": 0.0042,
"reward": 2.352730870246887,
"reward_std": 0.1454787813127041,
"rewards/accuracy_reward": 1.3605434894561768,
"rewards/format_reward": 0.9921875,
"step": 476
},
{
"completion_length": 71.828125,
"epoch": 5.746987951807229,
"grad_norm": 4.187659893839478,
"kl": 0.111328125,
"learning_rate": 4.2168674698795174e-08,
"loss": 0.0045,
"reward": 2.2670211791992188,
"reward_std": 0.22116923332214355,
"rewards/accuracy_reward": 1.267021119594574,
"rewards/format_reward": 1.0,
"step": 477
},
{
"completion_length": 69.1875,
"epoch": 5.759036144578313,
"grad_norm": 3.8623536023281617,
"kl": 0.114013671875,
"learning_rate": 4.016064257028112e-08,
"loss": 0.0046,
"reward": 2.222132921218872,
"reward_std": 0.23479964584112167,
"rewards/accuracy_reward": 1.2221328020095825,
"rewards/format_reward": 1.0,
"step": 478
},
{
"completion_length": 70.9296875,
"epoch": 5.771084337349397,
"grad_norm": 4.262446208684037,
"kl": 0.09375,
"learning_rate": 3.815261044176707e-08,
"loss": 0.0037,
"reward": 2.2334243059158325,
"reward_std": 0.21778832376003265,
"rewards/accuracy_reward": 1.2334243059158325,
"rewards/format_reward": 1.0,
"step": 479
},
{
"completion_length": 68.2421875,
"epoch": 5.783132530120482,
"grad_norm": 3.475197673617196,
"kl": 0.10595703125,
"learning_rate": 3.614457831325301e-08,
"loss": 0.0042,
"reward": 2.4461944103240967,
"reward_std": 0.21106188744306564,
"rewards/accuracy_reward": 1.4540069103240967,
"rewards/format_reward": 0.9921875,
"step": 480
},
{
"completion_length": 70.3671875,
"epoch": 5.795180722891566,
"grad_norm": 4.56883704942929,
"kl": 0.11865234375,
"learning_rate": 3.4136546184738955e-08,
"loss": 0.0047,
"reward": 2.441108226776123,
"reward_std": 0.2091435343027115,
"rewards/accuracy_reward": 1.441108226776123,
"rewards/format_reward": 1.0,
"step": 481
},
{
"completion_length": 69.171875,
"epoch": 5.807228915662651,
"grad_norm": 3.959761896565078,
"kl": 0.12451171875,
"learning_rate": 3.2128514056224896e-08,
"loss": 0.005,
"reward": 2.3847368955612183,
"reward_std": 0.14646587148308754,
"rewards/accuracy_reward": 1.3847368359565735,
"rewards/format_reward": 1.0,
"step": 482
},
{
"completion_length": 75.3125,
"epoch": 5.8192771084337345,
"grad_norm": 4.6238410926161855,
"kl": 0.108642578125,
"learning_rate": 3.012048192771084e-08,
"loss": 0.0043,
"reward": 2.2356351613998413,
"reward_std": 0.3032216280698776,
"rewards/accuracy_reward": 1.2434476613998413,
"rewards/format_reward": 0.9921875,
"step": 483
},
{
"completion_length": 70.921875,
"epoch": 5.831325301204819,
"grad_norm": 4.963499305554948,
"kl": 0.082275390625,
"learning_rate": 2.8112449799196786e-08,
"loss": 0.0033,
"reward": 2.3230150938034058,
"reward_std": 0.16892920434474945,
"rewards/accuracy_reward": 1.3230149745941162,
"rewards/format_reward": 1.0,
"step": 484
},
{
"completion_length": 69.3359375,
"epoch": 5.843373493975903,
"grad_norm": 4.069771837808966,
"kl": 0.1396484375,
"learning_rate": 2.610441767068273e-08,
"loss": 0.0056,
"reward": 2.327863335609436,
"reward_std": 0.23238816112279892,
"rewards/accuracy_reward": 1.3434883952140808,
"rewards/format_reward": 0.984375,
"step": 485
},
{
"completion_length": 68.875,
"epoch": 5.855421686746988,
"grad_norm": 4.471391988945464,
"kl": 0.13330078125,
"learning_rate": 2.4096385542168673e-08,
"loss": 0.0053,
"reward": 2.331111192703247,
"reward_std": 0.1987084299325943,
"rewards/accuracy_reward": 1.3389237523078918,
"rewards/format_reward": 0.9921875,
"step": 486
},
{
"completion_length": 72.2734375,
"epoch": 5.867469879518072,
"grad_norm": 4.3661266337784514,
"kl": 0.128173828125,
"learning_rate": 2.2088353413654617e-08,
"loss": 0.0051,
"reward": 2.2740135192871094,
"reward_std": 0.17679665982723236,
"rewards/accuracy_reward": 1.2740132808685303,
"rewards/format_reward": 1.0,
"step": 487
},
{
"completion_length": 69.328125,
"epoch": 5.879518072289157,
"grad_norm": 4.78815312664634,
"kl": 0.150634765625,
"learning_rate": 2.008032128514056e-08,
"loss": 0.006,
"reward": 2.2422866821289062,
"reward_std": 0.23693696409463882,
"rewards/accuracy_reward": 1.2422866821289062,
"rewards/format_reward": 1.0,
"step": 488
},
{
"completion_length": 71.4140625,
"epoch": 5.891566265060241,
"grad_norm": 6.245102077972556,
"kl": 0.121826171875,
"learning_rate": 1.8072289156626504e-08,
"loss": 0.0049,
"reward": 2.315194010734558,
"reward_std": 0.1885218769311905,
"rewards/accuracy_reward": 1.3230066299438477,
"rewards/format_reward": 0.9921875,
"step": 489
},
{
"completion_length": 63.8984375,
"epoch": 5.903614457831325,
"grad_norm": 4.510763484461414,
"kl": 0.122314453125,
"learning_rate": 1.6064257028112448e-08,
"loss": 0.0049,
"reward": 2.3149102926254272,
"reward_std": 0.1639706939458847,
"rewards/accuracy_reward": 1.3149102926254272,
"rewards/format_reward": 1.0,
"step": 490
},
{
"completion_length": 66.0,
"epoch": 5.9156626506024095,
"grad_norm": 4.091329557372317,
"kl": 0.1435546875,
"learning_rate": 1.4056224899598393e-08,
"loss": 0.0058,
"reward": 2.4370064735412598,
"reward_std": 0.15971215814352036,
"rewards/accuracy_reward": 1.4370064735412598,
"rewards/format_reward": 1.0,
"step": 491
},
{
"completion_length": 70.484375,
"epoch": 5.927710843373494,
"grad_norm": 4.3856574896033305,
"kl": 0.155029296875,
"learning_rate": 1.2048192771084337e-08,
"loss": 0.0062,
"reward": 2.351839542388916,
"reward_std": 0.2616487815976143,
"rewards/accuracy_reward": 1.359652042388916,
"rewards/format_reward": 0.9921875,
"step": 492
},
{
"completion_length": 74.171875,
"epoch": 5.9397590361445785,
"grad_norm": 3.3373281083458974,
"kl": 0.107177734375,
"learning_rate": 1.004016064257028e-08,
"loss": 0.0043,
"reward": 2.3034894466400146,
"reward_std": 0.12144535779953003,
"rewards/accuracy_reward": 1.3113019466400146,
"rewards/format_reward": 0.9921875,
"step": 493
},
{
"completion_length": 72.8515625,
"epoch": 5.951807228915663,
"grad_norm": 3.3157754210190773,
"kl": 0.097412109375,
"learning_rate": 8.032128514056224e-09,
"loss": 0.0039,
"reward": 2.421133041381836,
"reward_std": 0.16620434820652008,
"rewards/accuracy_reward": 1.421133041381836,
"rewards/format_reward": 1.0,
"step": 494
},
{
"completion_length": 76.1328125,
"epoch": 5.9638554216867465,
"grad_norm": 3.788575194538334,
"kl": 0.12158203125,
"learning_rate": 6.024096385542168e-09,
"loss": 0.0049,
"reward": 2.3588104248046875,
"reward_std": 0.1766229048371315,
"rewards/accuracy_reward": 1.358810544013977,
"rewards/format_reward": 1.0,
"step": 495
},
{
"completion_length": 71.515625,
"epoch": 5.975903614457831,
"grad_norm": 4.2730966058785835,
"kl": 0.11962890625,
"learning_rate": 4.016064257028112e-09,
"loss": 0.0048,
"reward": 2.3155951499938965,
"reward_std": 0.25304850190877914,
"rewards/accuracy_reward": 1.3234076499938965,
"rewards/format_reward": 0.9921875,
"step": 496
},
{
"completion_length": 68.859375,
"epoch": 5.9879518072289155,
"grad_norm": 4.371956801820215,
"kl": 0.119140625,
"learning_rate": 2.008032128514056e-09,
"loss": 0.0048,
"reward": 2.3737374544143677,
"reward_std": 0.20605729520320892,
"rewards/accuracy_reward": 1.373737394809723,
"rewards/format_reward": 1.0,
"step": 497
},
{
"completion_length": 60.75000190734863,
"epoch": 6.0,
"grad_norm": 3.9720317304626964,
"kl": 0.1171875,
"learning_rate": 0.0,
"loss": 0.0046,
"reward": 2.4247955083847046,
"reward_std": 0.17968511581420898,
"rewards/accuracy_reward": 1.4247953295707703,
"rewards/format_reward": 1.0,
"step": 498
}
],
"logging_steps": 1.0,
"max_steps": 498,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}