{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 498, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 82.65625, "epoch": 0.012048192771084338, "grad_norm": 5.841508927710052, "kl": 0.0, "learning_rate": 9.97991967871486e-07, "loss": 0.0, "reward": 1.4489864706993103, "reward_std": 0.8421240150928497, "rewards/accuracy_reward": 0.8005490005016327, "rewards/format_reward": 0.6484375, "step": 1 }, { "completion_length": 91.453125, "epoch": 0.024096385542168676, "grad_norm": 4.392637703815363, "kl": 0.00279998779296875, "learning_rate": 9.959839357429717e-07, "loss": 0.0001, "reward": 1.3076424598693848, "reward_std": 0.8380775451660156, "rewards/accuracy_reward": 0.6123300492763519, "rewards/format_reward": 0.6953125, "step": 2 }, { "completion_length": 79.171875, "epoch": 0.03614457831325301, "grad_norm": 5.134937236220538, "kl": 0.009063720703125, "learning_rate": 9.93975903614458e-07, "loss": 0.0004, "reward": 1.650797963142395, "reward_std": 0.8256142735481262, "rewards/accuracy_reward": 0.8773605227470398, "rewards/format_reward": 0.7734375, "step": 3 }, { "completion_length": 90.8671875, "epoch": 0.04819277108433735, "grad_norm": 4.181043208735878, "kl": 0.0099029541015625, "learning_rate": 9.919678714859437e-07, "loss": 0.0004, "reward": 1.4978268146514893, "reward_std": 0.7668428122997284, "rewards/accuracy_reward": 0.6618892848491669, "rewards/format_reward": 0.8359375, "step": 4 }, { "completion_length": 83.15625, "epoch": 0.060240963855421686, "grad_norm": 4.623169300333461, "kl": 0.028106689453125, "learning_rate": 9.899598393574296e-07, "loss": 0.0011, "reward": 1.959537386894226, "reward_std": 0.6147363781929016, "rewards/accuracy_reward": 1.0532873272895813, "rewards/format_reward": 0.90625, "step": 5 }, { "completion_length": 75.1484375, "epoch": 0.07228915662650602, "grad_norm": 5.568012410409197, "kl": 0.03021240234375, "learning_rate": 9.879518072289156e-07, "loss": 0.0012, "reward": 2.047786593437195, "reward_std": 0.4053535610437393, "rewards/accuracy_reward": 1.0946615934371948, "rewards/format_reward": 0.953125, "step": 6 }, { "completion_length": 76.03125, "epoch": 0.08433734939759036, "grad_norm": 4.7579852016782045, "kl": 0.033935546875, "learning_rate": 9.859437751004016e-07, "loss": 0.0014, "reward": 2.1630080938339233, "reward_std": 0.3877447098493576, "rewards/accuracy_reward": 1.2333204746246338, "rewards/format_reward": 0.9296875, "step": 7 }, { "completion_length": 71.546875, "epoch": 0.0963855421686747, "grad_norm": 9.256093312505593, "kl": 0.244384765625, "learning_rate": 9.839357429718876e-07, "loss": 0.0097, "reward": 2.015242576599121, "reward_std": 0.4337102472782135, "rewards/accuracy_reward": 1.054305076599121, "rewards/format_reward": 0.9609375, "step": 8 }, { "completion_length": 72.1796875, "epoch": 0.10843373493975904, "grad_norm": 9.959610046323814, "kl": 0.2841796875, "learning_rate": 9.819277108433734e-07, "loss": 0.0114, "reward": 1.9989103078842163, "reward_std": 0.38074547052383423, "rewards/accuracy_reward": 1.0145351886749268, "rewards/format_reward": 0.984375, "step": 9 }, { "completion_length": 67.0078125, "epoch": 0.12048192771084337, "grad_norm": 4.494217301954794, "kl": 0.0677490234375, "learning_rate": 9.799196787148593e-07, "loss": 0.0027, "reward": 2.208647847175598, "reward_std": 0.20472895354032516, "rewards/accuracy_reward": 1.2086476683616638, "rewards/format_reward": 1.0, "step": 10 }, { "completion_length": 66.3125, "epoch": 0.13253012048192772, "grad_norm": 4.205085729740715, "kl": 0.111083984375, "learning_rate": 9.779116465863453e-07, "loss": 0.0044, "reward": 2.016738772392273, "reward_std": 0.39626075327396393, "rewards/accuracy_reward": 1.0323637425899506, "rewards/format_reward": 0.984375, "step": 11 }, { "completion_length": 64.2265625, "epoch": 0.14457831325301204, "grad_norm": 5.285643902891126, "kl": 0.0670166015625, "learning_rate": 9.759036144578313e-07, "loss": 0.0027, "reward": 2.0809445977211, "reward_std": 0.3285638391971588, "rewards/accuracy_reward": 1.080944538116455, "rewards/format_reward": 1.0, "step": 12 }, { "completion_length": 57.7265625, "epoch": 0.1566265060240964, "grad_norm": 5.332797970620105, "kl": 0.07958984375, "learning_rate": 9.738955823293173e-07, "loss": 0.0032, "reward": 2.1677627563476562, "reward_std": 0.32235731184482574, "rewards/accuracy_reward": 1.1677626371383667, "rewards/format_reward": 1.0, "step": 13 }, { "completion_length": 62.765625, "epoch": 0.1686746987951807, "grad_norm": 7.594424067233083, "kl": 0.086181640625, "learning_rate": 9.718875502008033e-07, "loss": 0.0034, "reward": 2.287484049797058, "reward_std": 0.2577601447701454, "rewards/accuracy_reward": 1.3031091094017029, "rewards/format_reward": 0.984375, "step": 14 }, { "completion_length": 61.28125, "epoch": 0.18072289156626506, "grad_norm": 6.602361615736723, "kl": 0.087890625, "learning_rate": 9.69879518072289e-07, "loss": 0.0035, "reward": 2.28032910823822, "reward_std": 0.38463760912418365, "rewards/accuracy_reward": 1.2881416082382202, "rewards/format_reward": 0.9921875, "step": 15 }, { "completion_length": 63.6796875, "epoch": 0.1927710843373494, "grad_norm": 4.1986480450121135, "kl": 0.078125, "learning_rate": 9.67871485943775e-07, "loss": 0.0031, "reward": 2.1277613639831543, "reward_std": 0.2963729351758957, "rewards/accuracy_reward": 1.1433865427970886, "rewards/format_reward": 0.984375, "step": 16 }, { "completion_length": 60.65625, "epoch": 0.20481927710843373, "grad_norm": 6.921299965436032, "kl": 0.088134765625, "learning_rate": 9.65863453815261e-07, "loss": 0.0035, "reward": 2.157727599143982, "reward_std": 0.30868735909461975, "rewards/accuracy_reward": 1.1733525395393372, "rewards/format_reward": 0.984375, "step": 17 }, { "completion_length": 59.2265625, "epoch": 0.21686746987951808, "grad_norm": 4.904213548043611, "kl": 0.07666015625, "learning_rate": 9.63855421686747e-07, "loss": 0.0031, "reward": 2.24626088142395, "reward_std": 0.22766248881816864, "rewards/accuracy_reward": 1.2540735006332397, "rewards/format_reward": 0.9921875, "step": 18 }, { "completion_length": 58.703125, "epoch": 0.2289156626506024, "grad_norm": 4.786279154756674, "kl": 0.109619140625, "learning_rate": 9.61847389558233e-07, "loss": 0.0044, "reward": 2.050855040550232, "reward_std": 0.35161878168582916, "rewards/accuracy_reward": 1.0586674511432648, "rewards/format_reward": 0.9921875, "step": 19 }, { "completion_length": 58.109375, "epoch": 0.24096385542168675, "grad_norm": 4.05967579782597, "kl": 0.08056640625, "learning_rate": 9.598393574297187e-07, "loss": 0.0032, "reward": 2.20633327960968, "reward_std": 0.3129453659057617, "rewards/accuracy_reward": 1.2219581604003906, "rewards/format_reward": 0.984375, "step": 20 }, { "completion_length": 57.71875, "epoch": 0.25301204819277107, "grad_norm": 5.8300935596675885, "kl": 0.080078125, "learning_rate": 9.57831325301205e-07, "loss": 0.0032, "reward": 2.417273759841919, "reward_std": 0.28760989010334015, "rewards/accuracy_reward": 1.4250862002372742, "rewards/format_reward": 0.9921875, "step": 21 }, { "completion_length": 54.5859375, "epoch": 0.26506024096385544, "grad_norm": 7.535044861581114, "kl": 0.106201171875, "learning_rate": 9.558232931726907e-07, "loss": 0.0042, "reward": 2.2527129650115967, "reward_std": 0.2951706647872925, "rewards/accuracy_reward": 1.2683378458023071, "rewards/format_reward": 0.984375, "step": 22 }, { "completion_length": 61.09375, "epoch": 0.27710843373493976, "grad_norm": 4.416172924233661, "kl": 0.10009765625, "learning_rate": 9.538152610441766e-07, "loss": 0.004, "reward": 2.1894314289093018, "reward_std": 0.21257736533880234, "rewards/accuracy_reward": 1.1894314289093018, "rewards/format_reward": 1.0, "step": 23 }, { "completion_length": 54.9921875, "epoch": 0.2891566265060241, "grad_norm": 4.553446996976198, "kl": 0.09814453125, "learning_rate": 9.518072289156625e-07, "loss": 0.0039, "reward": 2.3037142753601074, "reward_std": 0.3323938250541687, "rewards/accuracy_reward": 1.3115268349647522, "rewards/format_reward": 0.9921875, "step": 24 }, { "completion_length": 55.9921875, "epoch": 0.30120481927710846, "grad_norm": 8.671383785487564, "kl": 0.120849609375, "learning_rate": 9.497991967871486e-07, "loss": 0.0048, "reward": 2.239556074142456, "reward_std": 0.3447880446910858, "rewards/accuracy_reward": 1.2551808953285217, "rewards/format_reward": 0.984375, "step": 25 }, { "completion_length": 58.7890625, "epoch": 0.3132530120481928, "grad_norm": 8.322624639517006, "kl": 0.12353515625, "learning_rate": 9.477911646586345e-07, "loss": 0.0049, "reward": 2.2209770679473877, "reward_std": 0.3139883056282997, "rewards/accuracy_reward": 1.2287896275520325, "rewards/format_reward": 0.9921875, "step": 26 }, { "completion_length": 56.2421875, "epoch": 0.3253012048192771, "grad_norm": 20.55146941012377, "kl": 0.130126953125, "learning_rate": 9.457831325301205e-07, "loss": 0.0052, "reward": 2.344720959663391, "reward_std": 0.25742725282907486, "rewards/accuracy_reward": 1.3525334596633911, "rewards/format_reward": 0.9921875, "step": 27 }, { "completion_length": 52.3671875, "epoch": 0.3373493975903614, "grad_norm": 4.550988243582887, "kl": 0.12548828125, "learning_rate": 9.437751004016063e-07, "loss": 0.005, "reward": 2.407941460609436, "reward_std": 0.3139786869287491, "rewards/accuracy_reward": 1.4313790798187256, "rewards/format_reward": 0.9765625, "step": 28 }, { "completion_length": 53.328125, "epoch": 0.3493975903614458, "grad_norm": 5.133796660962732, "kl": 0.1435546875, "learning_rate": 9.417670682730924e-07, "loss": 0.0057, "reward": 2.3306795358657837, "reward_std": 0.3039723336696625, "rewards/accuracy_reward": 1.3463045954704285, "rewards/format_reward": 0.984375, "step": 29 }, { "completion_length": 53.8125, "epoch": 0.3614457831325301, "grad_norm": 6.796717577260548, "kl": 0.27880859375, "learning_rate": 9.397590361445783e-07, "loss": 0.0112, "reward": 2.2834625244140625, "reward_std": 0.3063512295484543, "rewards/accuracy_reward": 1.2834625244140625, "rewards/format_reward": 1.0, "step": 30 }, { "completion_length": 56.3203125, "epoch": 0.37349397590361444, "grad_norm": 4.3393989853337285, "kl": 0.14794921875, "learning_rate": 9.377510040160642e-07, "loss": 0.0059, "reward": 2.354575991630554, "reward_std": 0.314766064286232, "rewards/accuracy_reward": 1.3623886704444885, "rewards/format_reward": 0.9921875, "step": 31 }, { "completion_length": 54.171875, "epoch": 0.3855421686746988, "grad_norm": 4.279946209704863, "kl": 0.197265625, "learning_rate": 9.357429718875502e-07, "loss": 0.0079, "reward": 2.1385136246681213, "reward_std": 0.24586574733257294, "rewards/accuracy_reward": 1.1463261544704437, "rewards/format_reward": 0.9921875, "step": 32 }, { "completion_length": 51.4140625, "epoch": 0.39759036144578314, "grad_norm": 5.88762957444806, "kl": 0.1630859375, "learning_rate": 9.33734939759036e-07, "loss": 0.0065, "reward": 2.2907108068466187, "reward_std": 0.25231631100177765, "rewards/accuracy_reward": 1.2907109260559082, "rewards/format_reward": 1.0, "step": 33 }, { "completion_length": 50.4609375, "epoch": 0.40963855421686746, "grad_norm": 5.469228934242547, "kl": 0.16845703125, "learning_rate": 9.317269076305221e-07, "loss": 0.0067, "reward": 2.2533600330352783, "reward_std": 0.25808002054691315, "rewards/accuracy_reward": 1.2611725330352783, "rewards/format_reward": 0.9921875, "step": 34 }, { "completion_length": 47.84375, "epoch": 0.42168674698795183, "grad_norm": 5.412602747215773, "kl": 0.177734375, "learning_rate": 9.29718875502008e-07, "loss": 0.0071, "reward": 2.3132054805755615, "reward_std": 0.2454073503613472, "rewards/accuracy_reward": 1.3132054805755615, "rewards/format_reward": 1.0, "step": 35 }, { "completion_length": 44.21875, "epoch": 0.43373493975903615, "grad_norm": 5.190368238545804, "kl": 0.2275390625, "learning_rate": 9.27710843373494e-07, "loss": 0.0091, "reward": 2.2854232788085938, "reward_std": 0.29085223376750946, "rewards/accuracy_reward": 1.293235719203949, "rewards/format_reward": 0.9921875, "step": 36 }, { "completion_length": 48.71875, "epoch": 0.4457831325301205, "grad_norm": 4.780274291960778, "kl": 0.20751953125, "learning_rate": 9.257028112449798e-07, "loss": 0.0083, "reward": 2.246184825897217, "reward_std": 0.31601477414369583, "rewards/accuracy_reward": 1.261809766292572, "rewards/format_reward": 0.984375, "step": 37 }, { "completion_length": 42.265625, "epoch": 0.4578313253012048, "grad_norm": 6.234590681750942, "kl": 0.265625, "learning_rate": 9.236947791164659e-07, "loss": 0.0106, "reward": 2.112604260444641, "reward_std": 0.30199334025382996, "rewards/accuracy_reward": 1.1126042604446411, "rewards/format_reward": 1.0, "step": 38 }, { "completion_length": 45.1015625, "epoch": 0.46987951807228917, "grad_norm": 4.611394363412455, "kl": 0.15576171875, "learning_rate": 9.216867469879518e-07, "loss": 0.0062, "reward": 2.3590028285980225, "reward_std": 0.2973439395427704, "rewards/accuracy_reward": 1.3746278285980225, "rewards/format_reward": 0.984375, "step": 39 }, { "completion_length": 45.3046875, "epoch": 0.4819277108433735, "grad_norm": 6.117578716606278, "kl": 0.17626953125, "learning_rate": 9.196787148594377e-07, "loss": 0.0071, "reward": 2.2271867990493774, "reward_std": 0.22323830425739288, "rewards/accuracy_reward": 1.234999418258667, "rewards/format_reward": 0.9921875, "step": 40 }, { "completion_length": 41.9453125, "epoch": 0.4939759036144578, "grad_norm": 4.858430237306144, "kl": 0.2236328125, "learning_rate": 9.176706827309237e-07, "loss": 0.0089, "reward": 2.217424750328064, "reward_std": 0.2663164809346199, "rewards/accuracy_reward": 1.2252373099327087, "rewards/format_reward": 0.9921875, "step": 41 }, { "completion_length": 41.0234375, "epoch": 0.5060240963855421, "grad_norm": 4.127212546225013, "kl": 0.18212890625, "learning_rate": 9.156626506024095e-07, "loss": 0.0073, "reward": 2.16755473613739, "reward_std": 0.3387562334537506, "rewards/accuracy_reward": 1.1753671169281006, "rewards/format_reward": 0.9921875, "step": 42 }, { "completion_length": 42.6640625, "epoch": 0.5180722891566265, "grad_norm": 5.226665280180925, "kl": 0.23193359375, "learning_rate": 9.136546184738956e-07, "loss": 0.0093, "reward": 2.203770875930786, "reward_std": 0.3409430831670761, "rewards/accuracy_reward": 1.2350206971168518, "rewards/format_reward": 0.96875, "step": 43 }, { "completion_length": 40.9609375, "epoch": 0.5301204819277109, "grad_norm": 4.308668359699942, "kl": 0.134033203125, "learning_rate": 9.116465863453815e-07, "loss": 0.0054, "reward": 2.2817225456237793, "reward_std": 0.19574209302663803, "rewards/accuracy_reward": 1.281722605228424, "rewards/format_reward": 1.0, "step": 44 }, { "completion_length": 38.7734375, "epoch": 0.5421686746987951, "grad_norm": 6.033974360622575, "kl": 0.13232421875, "learning_rate": 9.096385542168675e-07, "loss": 0.0053, "reward": 2.2139052152633667, "reward_std": 0.28486668318510056, "rewards/accuracy_reward": 1.2451552748680115, "rewards/format_reward": 0.96875, "step": 45 }, { "completion_length": 41.1484375, "epoch": 0.5542168674698795, "grad_norm": 5.314865555502224, "kl": 0.11279296875, "learning_rate": 9.076305220883533e-07, "loss": 0.0045, "reward": 2.4188212156295776, "reward_std": 0.2556447684764862, "rewards/accuracy_reward": 1.4266336560249329, "rewards/format_reward": 0.9921875, "step": 46 }, { "completion_length": 42.7109375, "epoch": 0.5662650602409639, "grad_norm": 3.687080063413381, "kl": 0.123046875, "learning_rate": 9.056224899598393e-07, "loss": 0.0049, "reward": 2.2985291481018066, "reward_std": 0.2858593165874481, "rewards/accuracy_reward": 1.3063417077064514, "rewards/format_reward": 0.9921875, "step": 47 }, { "completion_length": 46.859375, "epoch": 0.5783132530120482, "grad_norm": 4.277184476359137, "kl": 0.20166015625, "learning_rate": 9.036144578313253e-07, "loss": 0.0081, "reward": 2.1704814434051514, "reward_std": 0.3619203567504883, "rewards/accuracy_reward": 1.186106562614441, "rewards/format_reward": 0.984375, "step": 48 }, { "completion_length": 45.21875, "epoch": 0.5903614457831325, "grad_norm": 3.7971557376020577, "kl": 0.124267578125, "learning_rate": 9.016064257028112e-07, "loss": 0.005, "reward": 2.1000068187713623, "reward_std": 0.2924596816301346, "rewards/accuracy_reward": 1.123444378376007, "rewards/format_reward": 0.9765625, "step": 49 }, { "completion_length": 44.7734375, "epoch": 0.6024096385542169, "grad_norm": 4.458817172061971, "kl": 0.111083984375, "learning_rate": 8.995983935742972e-07, "loss": 0.0044, "reward": 2.2635247707366943, "reward_std": 0.3522821515798569, "rewards/accuracy_reward": 1.2869621515274048, "rewards/format_reward": 0.9765625, "step": 50 }, { "completion_length": 51.5859375, "epoch": 0.6144578313253012, "grad_norm": 5.351600002967812, "kl": 0.115234375, "learning_rate": 8.97590361445783e-07, "loss": 0.0046, "reward": 2.321009397506714, "reward_std": 0.23405297100543976, "rewards/accuracy_reward": 1.3366344571113586, "rewards/format_reward": 0.984375, "step": 51 }, { "completion_length": 50.421875, "epoch": 0.6265060240963856, "grad_norm": 4.213335817741083, "kl": 0.1396484375, "learning_rate": 8.955823293172691e-07, "loss": 0.0056, "reward": 2.3553450107574463, "reward_std": 0.25443293899297714, "rewards/accuracy_reward": 1.3944076299667358, "rewards/format_reward": 0.9609375, "step": 52 }, { "completion_length": 60.6015625, "epoch": 0.6385542168674698, "grad_norm": 6.123689334744157, "kl": 0.121337890625, "learning_rate": 8.93574297188755e-07, "loss": 0.0049, "reward": 2.112071990966797, "reward_std": 0.30149899423122406, "rewards/accuracy_reward": 1.1433220505714417, "rewards/format_reward": 0.96875, "step": 53 }, { "completion_length": 50.0703125, "epoch": 0.6506024096385542, "grad_norm": 4.396654754831157, "kl": 0.1337890625, "learning_rate": 8.915662650602409e-07, "loss": 0.0053, "reward": 2.233729839324951, "reward_std": 0.23247240483760834, "rewards/accuracy_reward": 1.2571672797203064, "rewards/format_reward": 0.9765625, "step": 54 }, { "completion_length": 60.2890625, "epoch": 0.6626506024096386, "grad_norm": 7.03985835954293, "kl": 0.10498046875, "learning_rate": 8.895582329317268e-07, "loss": 0.0042, "reward": 2.196902871131897, "reward_std": 0.2882121652364731, "rewards/accuracy_reward": 1.2125278115272522, "rewards/format_reward": 0.984375, "step": 55 }, { "completion_length": 50.640625, "epoch": 0.6746987951807228, "grad_norm": 4.86896494949543, "kl": 0.12451171875, "learning_rate": 8.875502008032128e-07, "loss": 0.005, "reward": 2.171112537384033, "reward_std": 0.16461243480443954, "rewards/accuracy_reward": 1.1867375373840332, "rewards/format_reward": 0.984375, "step": 56 }, { "completion_length": 53.21875, "epoch": 0.6867469879518072, "grad_norm": 3.557538165261062, "kl": 0.1240234375, "learning_rate": 8.855421686746988e-07, "loss": 0.005, "reward": 2.2328275442123413, "reward_std": 0.2752218544483185, "rewards/accuracy_reward": 1.2406402230262756, "rewards/format_reward": 0.9921875, "step": 57 }, { "completion_length": 47.8671875, "epoch": 0.6987951807228916, "grad_norm": 5.180162989820259, "kl": 0.125, "learning_rate": 8.835341365461847e-07, "loss": 0.005, "reward": 2.2453041076660156, "reward_std": 0.315682128071785, "rewards/accuracy_reward": 1.268741488456726, "rewards/format_reward": 0.9765625, "step": 58 }, { "completion_length": 57.9765625, "epoch": 0.7108433734939759, "grad_norm": 3.899105782667564, "kl": 0.10205078125, "learning_rate": 8.815261044176707e-07, "loss": 0.0041, "reward": 2.284543514251709, "reward_std": 0.25333235412836075, "rewards/accuracy_reward": 1.292356252670288, "rewards/format_reward": 0.9921875, "step": 59 }, { "completion_length": 46.5859375, "epoch": 0.7228915662650602, "grad_norm": 13.765129472909528, "kl": 0.106201171875, "learning_rate": 8.795180722891565e-07, "loss": 0.0042, "reward": 2.113099694252014, "reward_std": 0.326066330075264, "rewards/accuracy_reward": 1.1287246942520142, "rewards/format_reward": 0.984375, "step": 60 }, { "completion_length": 46.375, "epoch": 0.7349397590361446, "grad_norm": 6.1270425433473, "kl": 0.16357421875, "learning_rate": 8.775100401606425e-07, "loss": 0.0065, "reward": 1.9968695640563965, "reward_std": 0.34320104122161865, "rewards/accuracy_reward": 1.0124945640563965, "rewards/format_reward": 0.984375, "step": 61 }, { "completion_length": 53.09375, "epoch": 0.7469879518072289, "grad_norm": 4.3056291481606745, "kl": 0.1513671875, "learning_rate": 8.755020080321285e-07, "loss": 0.0061, "reward": 2.1780970096588135, "reward_std": 0.2706674858927727, "rewards/accuracy_reward": 1.2093469500541687, "rewards/format_reward": 0.96875, "step": 62 }, { "completion_length": 55.9375, "epoch": 0.7590361445783133, "grad_norm": 3.2395174572422416, "kl": 0.14501953125, "learning_rate": 8.734939759036144e-07, "loss": 0.0058, "reward": 2.1430922746658325, "reward_std": 0.24412654340267181, "rewards/accuracy_reward": 1.1665297150611877, "rewards/format_reward": 0.9765625, "step": 63 }, { "completion_length": 56.6328125, "epoch": 0.7710843373493976, "grad_norm": 4.190814109425291, "kl": 0.11962890625, "learning_rate": 8.714859437751003e-07, "loss": 0.0048, "reward": 2.1700193881988525, "reward_std": 0.2942150831222534, "rewards/accuracy_reward": 1.1934569478034973, "rewards/format_reward": 0.9765625, "step": 64 }, { "completion_length": 64.3984375, "epoch": 0.7831325301204819, "grad_norm": 3.226137200230793, "kl": 0.102783203125, "learning_rate": 8.694779116465863e-07, "loss": 0.0041, "reward": 2.2898290157318115, "reward_std": 0.2443845123052597, "rewards/accuracy_reward": 1.3132665753364563, "rewards/format_reward": 0.9765625, "step": 65 }, { "completion_length": 67.7109375, "epoch": 0.7951807228915663, "grad_norm": 3.9157620361816314, "kl": 0.0927734375, "learning_rate": 8.674698795180723e-07, "loss": 0.0037, "reward": 2.161790609359741, "reward_std": 0.29590657353401184, "rewards/accuracy_reward": 1.1696029901504517, "rewards/format_reward": 0.9921875, "step": 66 }, { "completion_length": 74.3203125, "epoch": 0.8072289156626506, "grad_norm": 3.1212414712368375, "kl": 0.082763671875, "learning_rate": 8.654618473895582e-07, "loss": 0.0033, "reward": 2.215745210647583, "reward_std": 0.2766411006450653, "rewards/accuracy_reward": 1.2313700914382935, "rewards/format_reward": 0.984375, "step": 67 }, { "completion_length": 74.0390625, "epoch": 0.8192771084337349, "grad_norm": 3.446969302283755, "kl": 0.074951171875, "learning_rate": 8.634538152610441e-07, "loss": 0.003, "reward": 2.1964612007141113, "reward_std": 0.235237754881382, "rewards/accuracy_reward": 1.2198986411094666, "rewards/format_reward": 0.9765625, "step": 68 }, { "completion_length": 76.9375, "epoch": 0.8313253012048193, "grad_norm": 3.310962519125171, "kl": 0.08154296875, "learning_rate": 8.614457831325301e-07, "loss": 0.0033, "reward": 2.1269989013671875, "reward_std": 0.2448011264204979, "rewards/accuracy_reward": 1.1426239013671875, "rewards/format_reward": 0.984375, "step": 69 }, { "completion_length": 71.3984375, "epoch": 0.8433734939759037, "grad_norm": 3.2998576155248966, "kl": 0.0888671875, "learning_rate": 8.59437751004016e-07, "loss": 0.0036, "reward": 2.2479825019836426, "reward_std": 0.2886482775211334, "rewards/accuracy_reward": 1.2636074423789978, "rewards/format_reward": 0.984375, "step": 70 }, { "completion_length": 72.1484375, "epoch": 0.8554216867469879, "grad_norm": 7.668000907111886, "kl": 0.07861328125, "learning_rate": 8.57429718875502e-07, "loss": 0.0031, "reward": 2.2247371673583984, "reward_std": 0.2391326129436493, "rewards/accuracy_reward": 1.2637996673583984, "rewards/format_reward": 0.9609375, "step": 71 }, { "completion_length": 77.7734375, "epoch": 0.8674698795180723, "grad_norm": 3.4104191137958013, "kl": 0.068359375, "learning_rate": 8.554216867469879e-07, "loss": 0.0027, "reward": 2.2031702995300293, "reward_std": 0.21321924775838852, "rewards/accuracy_reward": 1.210982859134674, "rewards/format_reward": 0.9921875, "step": 72 }, { "completion_length": 76.5546875, "epoch": 0.8795180722891566, "grad_norm": 3.884229840630286, "kl": 0.0947265625, "learning_rate": 8.534136546184738e-07, "loss": 0.0038, "reward": 2.2307136058807373, "reward_std": 0.2959597185254097, "rewards/accuracy_reward": 1.2463387250900269, "rewards/format_reward": 0.984375, "step": 73 }, { "completion_length": 73.7265625, "epoch": 0.891566265060241, "grad_norm": 7.2397255809983525, "kl": 0.170654296875, "learning_rate": 8.514056224899598e-07, "loss": 0.0068, "reward": 2.311343193054199, "reward_std": 0.21377335488796234, "rewards/accuracy_reward": 1.319155752658844, "rewards/format_reward": 0.9921875, "step": 74 }, { "completion_length": 71.5859375, "epoch": 0.9036144578313253, "grad_norm": 3.397020763244455, "kl": 0.073974609375, "learning_rate": 8.493975903614458e-07, "loss": 0.003, "reward": 2.3479005098342896, "reward_std": 0.2722414582967758, "rewards/accuracy_reward": 1.3713379502296448, "rewards/format_reward": 0.9765625, "step": 75 }, { "completion_length": 64.34375, "epoch": 0.9156626506024096, "grad_norm": 4.709358727325993, "kl": 0.116455078125, "learning_rate": 8.473895582329317e-07, "loss": 0.0047, "reward": 2.1038066148757935, "reward_std": 0.3149692267179489, "rewards/accuracy_reward": 1.158493995666504, "rewards/format_reward": 0.9453125, "step": 76 }, { "completion_length": 69.390625, "epoch": 0.927710843373494, "grad_norm": 3.3768601117352923, "kl": 0.11376953125, "learning_rate": 8.453815261044176e-07, "loss": 0.0046, "reward": 2.02778023481369, "reward_std": 0.3105141818523407, "rewards/accuracy_reward": 1.074655294418335, "rewards/format_reward": 0.953125, "step": 77 }, { "completion_length": 67.328125, "epoch": 0.9397590361445783, "grad_norm": 3.504578270706009, "kl": 0.115234375, "learning_rate": 8.433734939759036e-07, "loss": 0.0046, "reward": 2.194709539413452, "reward_std": 0.27273692935705185, "rewards/accuracy_reward": 1.2181469202041626, "rewards/format_reward": 0.9765625, "step": 78 }, { "completion_length": 75.1640625, "epoch": 0.9518072289156626, "grad_norm": 4.043012399812061, "kl": 0.123046875, "learning_rate": 8.413654618473895e-07, "loss": 0.0049, "reward": 2.13509202003479, "reward_std": 0.313528910279274, "rewards/accuracy_reward": 1.18196702003479, "rewards/format_reward": 0.953125, "step": 79 }, { "completion_length": 70.0234375, "epoch": 0.963855421686747, "grad_norm": 4.870660538899373, "kl": 0.086181640625, "learning_rate": 8.393574297188755e-07, "loss": 0.0035, "reward": 2.1953389644622803, "reward_std": 0.26908765733242035, "rewards/accuracy_reward": 1.2265888452529907, "rewards/format_reward": 0.96875, "step": 80 }, { "completion_length": 80.859375, "epoch": 0.9759036144578314, "grad_norm": 3.8261245848047065, "kl": 0.1015625, "learning_rate": 8.373493975903614e-07, "loss": 0.0041, "reward": 2.0212653279304504, "reward_std": 0.3835397958755493, "rewards/accuracy_reward": 1.0915777683258057, "rewards/format_reward": 0.9296875, "step": 81 }, { "completion_length": 74.046875, "epoch": 0.9879518072289156, "grad_norm": 4.0964460767880535, "kl": 0.083984375, "learning_rate": 8.353413654618474e-07, "loss": 0.0034, "reward": 2.2536615133285522, "reward_std": 0.2658763527870178, "rewards/accuracy_reward": 1.2770991325378418, "rewards/format_reward": 0.9765625, "step": 82 }, { "completion_length": 74.58333587646484, "epoch": 1.0, "grad_norm": 2.9272571318373655, "kl": 0.1044921875, "learning_rate": 8.333333333333333e-07, "loss": 0.004, "reward": 2.1187774538993835, "reward_std": 0.1469321921467781, "rewards/accuracy_reward": 1.1187774240970612, "rewards/format_reward": 1.0, "step": 83 }, { "completion_length": 67.5390625, "epoch": 1.0120481927710843, "grad_norm": 4.360041456699287, "kl": 0.116455078125, "learning_rate": 8.313253012048192e-07, "loss": 0.0047, "reward": 2.2748764753341675, "reward_std": 0.30198951065540314, "rewards/accuracy_reward": 1.2983139157295227, "rewards/format_reward": 0.9765625, "step": 84 }, { "completion_length": 71.640625, "epoch": 1.0240963855421688, "grad_norm": 3.852904865115574, "kl": 0.100341796875, "learning_rate": 8.293172690763052e-07, "loss": 0.004, "reward": 2.22179639339447, "reward_std": 0.2614322751760483, "rewards/accuracy_reward": 1.2452340126037598, "rewards/format_reward": 0.9765625, "step": 85 }, { "completion_length": 77.71875, "epoch": 1.036144578313253, "grad_norm": 4.570601093607917, "kl": 0.086181640625, "learning_rate": 8.273092369477911e-07, "loss": 0.0034, "reward": 2.3267804384231567, "reward_std": 0.1871008574962616, "rewards/accuracy_reward": 1.3424054384231567, "rewards/format_reward": 0.984375, "step": 86 }, { "completion_length": 74.0703125, "epoch": 1.0481927710843373, "grad_norm": 4.387034223472388, "kl": 0.09033203125, "learning_rate": 8.253012048192771e-07, "loss": 0.0036, "reward": 2.280067205429077, "reward_std": 0.2090277522802353, "rewards/accuracy_reward": 1.2800670266151428, "rewards/format_reward": 1.0, "step": 87 }, { "completion_length": 72.8828125, "epoch": 1.0602409638554218, "grad_norm": 3.640432077142004, "kl": 0.097412109375, "learning_rate": 8.23293172690763e-07, "loss": 0.0039, "reward": 2.2264442443847656, "reward_std": 0.2877971976995468, "rewards/accuracy_reward": 1.2576942443847656, "rewards/format_reward": 0.96875, "step": 88 }, { "completion_length": 68.9765625, "epoch": 1.072289156626506, "grad_norm": 3.6617214501921755, "kl": 0.10107421875, "learning_rate": 8.21285140562249e-07, "loss": 0.004, "reward": 2.232625722885132, "reward_std": 0.26599176973104477, "rewards/accuracy_reward": 1.2482507824897766, "rewards/format_reward": 0.984375, "step": 89 }, { "completion_length": 74.765625, "epoch": 1.0843373493975903, "grad_norm": 4.600311265578528, "kl": 0.09130859375, "learning_rate": 8.192771084337349e-07, "loss": 0.0037, "reward": 2.253629207611084, "reward_std": 0.21175827831029892, "rewards/accuracy_reward": 1.269254207611084, "rewards/format_reward": 0.984375, "step": 90 }, { "completion_length": 76.59375, "epoch": 1.0963855421686748, "grad_norm": 4.145602929032845, "kl": 0.087646484375, "learning_rate": 8.172690763052207e-07, "loss": 0.0035, "reward": 2.2744953632354736, "reward_std": 0.24358398467302322, "rewards/accuracy_reward": 1.2901203632354736, "rewards/format_reward": 0.984375, "step": 91 }, { "completion_length": 75.875, "epoch": 1.108433734939759, "grad_norm": 3.8292102418969853, "kl": 0.10693359375, "learning_rate": 8.152610441767068e-07, "loss": 0.0043, "reward": 2.4102468490600586, "reward_std": 0.22168071568012238, "rewards/accuracy_reward": 1.4180592894554138, "rewards/format_reward": 0.9921875, "step": 92 }, { "completion_length": 73.5078125, "epoch": 1.1204819277108433, "grad_norm": 3.889694391559541, "kl": 0.0859375, "learning_rate": 8.132530120481927e-07, "loss": 0.0034, "reward": 2.19115674495697, "reward_std": 0.191669300198555, "rewards/accuracy_reward": 1.1989692449569702, "rewards/format_reward": 0.9921875, "step": 93 }, { "completion_length": 74.359375, "epoch": 1.1325301204819278, "grad_norm": 13.572499915490392, "kl": 0.115966796875, "learning_rate": 8.112449799196787e-07, "loss": 0.0046, "reward": 2.3821544647216797, "reward_std": 0.2079356163740158, "rewards/accuracy_reward": 1.3899668455123901, "rewards/format_reward": 0.9921875, "step": 94 }, { "completion_length": 70.875, "epoch": 1.144578313253012, "grad_norm": 3.96863603284974, "kl": 0.096923828125, "learning_rate": 8.092369477911646e-07, "loss": 0.0039, "reward": 2.301279664039612, "reward_std": 0.17724627256393433, "rewards/accuracy_reward": 1.309092104434967, "rewards/format_reward": 0.9921875, "step": 95 }, { "completion_length": 69.3125, "epoch": 1.1566265060240963, "grad_norm": 3.4379001474745206, "kl": 0.090087890625, "learning_rate": 8.072289156626506e-07, "loss": 0.0036, "reward": 2.371612310409546, "reward_std": 0.1584479957818985, "rewards/accuracy_reward": 1.371612310409546, "rewards/format_reward": 1.0, "step": 96 }, { "completion_length": 68.6171875, "epoch": 1.1686746987951806, "grad_norm": 4.586260816062996, "kl": 0.09375, "learning_rate": 8.052208835341365e-07, "loss": 0.0037, "reward": 2.4862219095230103, "reward_std": 0.20000579208135605, "rewards/accuracy_reward": 1.4862220287322998, "rewards/format_reward": 1.0, "step": 97 }, { "completion_length": 70.015625, "epoch": 1.180722891566265, "grad_norm": 4.047101829945655, "kl": 0.112060546875, "learning_rate": 8.032128514056225e-07, "loss": 0.0045, "reward": 2.2514266967773438, "reward_std": 0.22294947504997253, "rewards/accuracy_reward": 1.2514267563819885, "rewards/format_reward": 1.0, "step": 98 }, { "completion_length": 66.9140625, "epoch": 1.1927710843373494, "grad_norm": 5.444249065473958, "kl": 0.088134765625, "learning_rate": 8.012048192771084e-07, "loss": 0.0035, "reward": 2.333179473876953, "reward_std": 0.1811930388212204, "rewards/accuracy_reward": 1.3331794738769531, "rewards/format_reward": 1.0, "step": 99 }, { "completion_length": 65.828125, "epoch": 1.2048192771084336, "grad_norm": 7.074570957060863, "kl": 0.1064453125, "learning_rate": 7.991967871485942e-07, "loss": 0.0043, "reward": 2.278498649597168, "reward_std": 0.17714769393205643, "rewards/accuracy_reward": 1.2863109111785889, "rewards/format_reward": 0.9921875, "step": 100 }, { "completion_length": 62.6875, "epoch": 1.216867469879518, "grad_norm": 6.600402598086416, "kl": 0.099609375, "learning_rate": 7.971887550200803e-07, "loss": 0.004, "reward": 2.3798866271972656, "reward_std": 0.1492375209927559, "rewards/accuracy_reward": 1.3798866868019104, "rewards/format_reward": 1.0, "step": 101 }, { "completion_length": 67.234375, "epoch": 1.2289156626506024, "grad_norm": 5.4322907915163645, "kl": 0.0927734375, "learning_rate": 7.951807228915662e-07, "loss": 0.0037, "reward": 2.295409917831421, "reward_std": 0.26540718972682953, "rewards/accuracy_reward": 1.311034917831421, "rewards/format_reward": 0.984375, "step": 102 }, { "completion_length": 62.59375, "epoch": 1.2409638554216866, "grad_norm": 4.734234621294123, "kl": 0.10986328125, "learning_rate": 7.931726907630522e-07, "loss": 0.0044, "reward": 2.3131519556045532, "reward_std": 0.2041746824979782, "rewards/accuracy_reward": 1.3209643959999084, "rewards/format_reward": 0.9921875, "step": 103 }, { "completion_length": 65.0078125, "epoch": 1.2530120481927711, "grad_norm": 11.27432402123553, "kl": 0.094482421875, "learning_rate": 7.911646586345381e-07, "loss": 0.0038, "reward": 2.423591375350952, "reward_std": 0.17853456735610962, "rewards/accuracy_reward": 1.4235913753509521, "rewards/format_reward": 1.0, "step": 104 }, { "completion_length": 61.96875, "epoch": 1.2650602409638554, "grad_norm": 5.605209449566961, "kl": 0.10595703125, "learning_rate": 7.891566265060241e-07, "loss": 0.0042, "reward": 2.2498486042022705, "reward_std": 0.2505866587162018, "rewards/accuracy_reward": 1.2576610445976257, "rewards/format_reward": 0.9921875, "step": 105 }, { "completion_length": 69.890625, "epoch": 1.2771084337349397, "grad_norm": 9.555144265496201, "kl": 0.1015625, "learning_rate": 7.8714859437751e-07, "loss": 0.0041, "reward": 2.153669834136963, "reward_std": 0.2159716784954071, "rewards/accuracy_reward": 1.161482334136963, "rewards/format_reward": 0.9921875, "step": 106 }, { "completion_length": 63.5625, "epoch": 1.2891566265060241, "grad_norm": 4.205528221959235, "kl": 0.100341796875, "learning_rate": 7.851405622489959e-07, "loss": 0.004, "reward": 2.2599010467529297, "reward_std": 0.22189538180828094, "rewards/accuracy_reward": 1.2599008083343506, "rewards/format_reward": 1.0, "step": 107 }, { "completion_length": 60.3359375, "epoch": 1.3012048192771084, "grad_norm": 4.549607105799596, "kl": 0.13525390625, "learning_rate": 7.831325301204819e-07, "loss": 0.0054, "reward": 2.2945663928985596, "reward_std": 0.2269488275051117, "rewards/accuracy_reward": 1.2945663928985596, "rewards/format_reward": 1.0, "step": 108 }, { "completion_length": 63.9765625, "epoch": 1.3132530120481927, "grad_norm": 7.122658458301131, "kl": 0.10400390625, "learning_rate": 7.811244979919679e-07, "loss": 0.0042, "reward": 2.223813772201538, "reward_std": 0.2691728472709656, "rewards/accuracy_reward": 1.2316263318061829, "rewards/format_reward": 0.9921875, "step": 109 }, { "completion_length": 64.0390625, "epoch": 1.3253012048192772, "grad_norm": 4.0970391288989285, "kl": 0.102783203125, "learning_rate": 7.791164658634538e-07, "loss": 0.0041, "reward": 2.402035713195801, "reward_std": 0.2192593812942505, "rewards/accuracy_reward": 1.409848153591156, "rewards/format_reward": 0.9921875, "step": 110 }, { "completion_length": 61.984375, "epoch": 1.3373493975903614, "grad_norm": 5.00798288991921, "kl": 0.100830078125, "learning_rate": 7.771084337349397e-07, "loss": 0.004, "reward": 2.268544912338257, "reward_std": 0.17878198623657227, "rewards/accuracy_reward": 1.2685450315475464, "rewards/format_reward": 1.0, "step": 111 }, { "completion_length": 58.296875, "epoch": 1.3493975903614457, "grad_norm": 4.283142882967245, "kl": 0.10888671875, "learning_rate": 7.751004016064257e-07, "loss": 0.0044, "reward": 2.373852849006653, "reward_std": 0.17504306137561798, "rewards/accuracy_reward": 1.3738529086112976, "rewards/format_reward": 1.0, "step": 112 }, { "completion_length": 60.484375, "epoch": 1.3614457831325302, "grad_norm": 4.840347639337677, "kl": 0.097412109375, "learning_rate": 7.730923694779116e-07, "loss": 0.0039, "reward": 2.2944198846817017, "reward_std": 0.2088237851858139, "rewards/accuracy_reward": 1.2944198250770569, "rewards/format_reward": 1.0, "step": 113 }, { "completion_length": 59.6328125, "epoch": 1.3734939759036144, "grad_norm": 3.441438097506757, "kl": 0.095458984375, "learning_rate": 7.710843373493975e-07, "loss": 0.0038, "reward": 2.2015284299850464, "reward_std": 0.22288134694099426, "rewards/accuracy_reward": 1.201528549194336, "rewards/format_reward": 1.0, "step": 114 }, { "completion_length": 58.3203125, "epoch": 1.3855421686746987, "grad_norm": 5.2560716101244545, "kl": 0.12890625, "learning_rate": 7.690763052208835e-07, "loss": 0.0052, "reward": 2.395646095275879, "reward_std": 0.21848639845848083, "rewards/accuracy_reward": 1.3956461548805237, "rewards/format_reward": 1.0, "step": 115 }, { "completion_length": 58.2734375, "epoch": 1.3975903614457832, "grad_norm": 5.450406858307557, "kl": 0.1064453125, "learning_rate": 7.670682730923694e-07, "loss": 0.0043, "reward": 2.4746010303497314, "reward_std": 0.1482101045548916, "rewards/accuracy_reward": 1.4746010303497314, "rewards/format_reward": 1.0, "step": 116 }, { "completion_length": 57.65625, "epoch": 1.4096385542168675, "grad_norm": 4.642950561404122, "kl": 0.124267578125, "learning_rate": 7.650602409638554e-07, "loss": 0.005, "reward": 2.1899147033691406, "reward_std": 0.2073155865073204, "rewards/accuracy_reward": 1.1977271437644958, "rewards/format_reward": 0.9921875, "step": 117 }, { "completion_length": 56.609375, "epoch": 1.4216867469879517, "grad_norm": 9.36763410057133, "kl": 0.112548828125, "learning_rate": 7.630522088353414e-07, "loss": 0.0045, "reward": 2.457427501678467, "reward_std": 0.248141810297966, "rewards/accuracy_reward": 1.4574276804924011, "rewards/format_reward": 1.0, "step": 118 }, { "completion_length": 55.59375, "epoch": 1.4337349397590362, "grad_norm": 4.076025029890633, "kl": 0.095947265625, "learning_rate": 7.610441767068273e-07, "loss": 0.0038, "reward": 2.3175806999206543, "reward_std": 0.21353702247142792, "rewards/accuracy_reward": 1.3175806999206543, "rewards/format_reward": 1.0, "step": 119 }, { "completion_length": 56.359375, "epoch": 1.4457831325301205, "grad_norm": 4.1118838634058905, "kl": 0.10693359375, "learning_rate": 7.590361445783132e-07, "loss": 0.0043, "reward": 2.306099772453308, "reward_std": 0.2674330025911331, "rewards/accuracy_reward": 1.3217247128486633, "rewards/format_reward": 0.984375, "step": 120 }, { "completion_length": 56.765625, "epoch": 1.4578313253012047, "grad_norm": 4.370520474393478, "kl": 0.10302734375, "learning_rate": 7.570281124497991e-07, "loss": 0.0041, "reward": 2.1378331184387207, "reward_std": 0.24683931469917297, "rewards/accuracy_reward": 1.1378332376480103, "rewards/format_reward": 1.0, "step": 121 }, { "completion_length": 61.4453125, "epoch": 1.4698795180722892, "grad_norm": 3.7827942646929427, "kl": 0.120361328125, "learning_rate": 7.550200803212851e-07, "loss": 0.0048, "reward": 2.1952574253082275, "reward_std": 0.163675457239151, "rewards/accuracy_reward": 1.1952574849128723, "rewards/format_reward": 1.0, "step": 122 }, { "completion_length": 64.2734375, "epoch": 1.4819277108433735, "grad_norm": 3.7942059326042887, "kl": 0.115478515625, "learning_rate": 7.53012048192771e-07, "loss": 0.0046, "reward": 2.052876114845276, "reward_std": 0.3279467225074768, "rewards/accuracy_reward": 1.0606885850429535, "rewards/format_reward": 0.9921875, "step": 123 }, { "completion_length": 61.7578125, "epoch": 1.4939759036144578, "grad_norm": 4.163145774578374, "kl": 0.1083984375, "learning_rate": 7.51004016064257e-07, "loss": 0.0043, "reward": 2.483773946762085, "reward_std": 0.21236886084079742, "rewards/accuracy_reward": 1.483773946762085, "rewards/format_reward": 1.0, "step": 124 }, { "completion_length": 69.8359375, "epoch": 1.5060240963855422, "grad_norm": 8.540024207287942, "kl": 0.122314453125, "learning_rate": 7.489959839357429e-07, "loss": 0.0049, "reward": 2.207366466522217, "reward_std": 0.22365009784698486, "rewards/accuracy_reward": 1.2073664665222168, "rewards/format_reward": 1.0, "step": 125 }, { "completion_length": 68.21875, "epoch": 1.5180722891566265, "grad_norm": 4.163585518888115, "kl": 0.097412109375, "learning_rate": 7.469879518072289e-07, "loss": 0.0039, "reward": 2.3682451248168945, "reward_std": 0.17314215004444122, "rewards/accuracy_reward": 1.3682451844215393, "rewards/format_reward": 1.0, "step": 126 }, { "completion_length": 74.7734375, "epoch": 1.5301204819277108, "grad_norm": 5.7954755578535595, "kl": 0.09912109375, "learning_rate": 7.449799196787149e-07, "loss": 0.004, "reward": 2.3054428100585938, "reward_std": 0.166117824614048, "rewards/accuracy_reward": 1.313255250453949, "rewards/format_reward": 0.9921875, "step": 127 }, { "completion_length": 77.3046875, "epoch": 1.5421686746987953, "grad_norm": 4.318669163836461, "kl": 0.091796875, "learning_rate": 7.429718875502008e-07, "loss": 0.0037, "reward": 2.1308990716934204, "reward_std": 0.19852972030639648, "rewards/accuracy_reward": 1.13089919090271, "rewards/format_reward": 1.0, "step": 128 }, { "completion_length": 78.1015625, "epoch": 1.5542168674698795, "grad_norm": 4.096032296356097, "kl": 0.102783203125, "learning_rate": 7.409638554216867e-07, "loss": 0.0041, "reward": 2.445680260658264, "reward_std": 0.1704091727733612, "rewards/accuracy_reward": 1.4456802010536194, "rewards/format_reward": 1.0, "step": 129 }, { "completion_length": 74.75, "epoch": 1.5662650602409638, "grad_norm": 4.47404453525868, "kl": 0.100341796875, "learning_rate": 7.389558232931726e-07, "loss": 0.004, "reward": 2.2448705434799194, "reward_std": 0.21340852975845337, "rewards/accuracy_reward": 1.2448704838752747, "rewards/format_reward": 1.0, "step": 130 }, { "completion_length": 75.3671875, "epoch": 1.5783132530120483, "grad_norm": 23.135090346261265, "kl": 1.1025390625, "learning_rate": 7.369477911646586e-07, "loss": 0.0444, "reward": 2.368005871772766, "reward_std": 0.24276328086853027, "rewards/accuracy_reward": 1.3680058717727661, "rewards/format_reward": 1.0, "step": 131 }, { "completion_length": 76.5234375, "epoch": 1.5903614457831325, "grad_norm": 3.560296625305877, "kl": 0.14111328125, "learning_rate": 7.349397590361446e-07, "loss": 0.0056, "reward": 2.3832234144210815, "reward_std": 0.2271246314048767, "rewards/accuracy_reward": 1.398848533630371, "rewards/format_reward": 0.984375, "step": 132 }, { "completion_length": 78.515625, "epoch": 1.6024096385542168, "grad_norm": 4.271885997013165, "kl": 0.103271484375, "learning_rate": 7.329317269076305e-07, "loss": 0.0041, "reward": 2.11967396736145, "reward_std": 0.21069814264774323, "rewards/accuracy_reward": 1.119674026966095, "rewards/format_reward": 1.0, "step": 133 }, { "completion_length": 81.2109375, "epoch": 1.6144578313253013, "grad_norm": 3.989749340172797, "kl": 0.10009765625, "learning_rate": 7.309236947791164e-07, "loss": 0.004, "reward": 2.2381746768951416, "reward_std": 0.2712934762239456, "rewards/accuracy_reward": 1.2537997961044312, "rewards/format_reward": 0.984375, "step": 134 }, { "completion_length": 84.828125, "epoch": 1.6265060240963856, "grad_norm": 5.101727030105181, "kl": 0.0927734375, "learning_rate": 7.289156626506024e-07, "loss": 0.0037, "reward": 2.3006190061569214, "reward_std": 0.2388201355934143, "rewards/accuracy_reward": 1.3084314465522766, "rewards/format_reward": 0.9921875, "step": 135 }, { "completion_length": 78.3984375, "epoch": 1.6385542168674698, "grad_norm": 7.945369222479043, "kl": 0.109130859375, "learning_rate": 7.269076305220884e-07, "loss": 0.0044, "reward": 2.187756061553955, "reward_std": 0.22536994516849518, "rewards/accuracy_reward": 1.2033808827400208, "rewards/format_reward": 0.984375, "step": 136 }, { "completion_length": 83.0234375, "epoch": 1.6506024096385543, "grad_norm": 7.511759922163927, "kl": 0.074462890625, "learning_rate": 7.248995983935742e-07, "loss": 0.003, "reward": 2.299572706222534, "reward_std": 0.22408785670995712, "rewards/accuracy_reward": 1.3073852062225342, "rewards/format_reward": 0.9921875, "step": 137 }, { "completion_length": 84.640625, "epoch": 1.6626506024096386, "grad_norm": 3.2982396535282623, "kl": 0.0810546875, "learning_rate": 7.228915662650602e-07, "loss": 0.0032, "reward": 2.3804391622543335, "reward_std": 0.2060808688402176, "rewards/accuracy_reward": 1.3804389834403992, "rewards/format_reward": 1.0, "step": 138 }, { "completion_length": 87.8125, "epoch": 1.6746987951807228, "grad_norm": 8.41708008218346, "kl": 0.0810546875, "learning_rate": 7.208835341365461e-07, "loss": 0.0032, "reward": 2.2146860361099243, "reward_std": 0.2540859431028366, "rewards/accuracy_reward": 1.2146860361099243, "rewards/format_reward": 1.0, "step": 139 }, { "completion_length": 86.140625, "epoch": 1.6867469879518073, "grad_norm": 3.5435273544538815, "kl": 0.072998046875, "learning_rate": 7.188755020080321e-07, "loss": 0.0029, "reward": 2.3307693004608154, "reward_std": 0.20385809987783432, "rewards/accuracy_reward": 1.3385818004608154, "rewards/format_reward": 0.9921875, "step": 140 }, { "completion_length": 85.9375, "epoch": 1.6987951807228916, "grad_norm": 3.544683408089574, "kl": 0.083984375, "learning_rate": 7.168674698795181e-07, "loss": 0.0034, "reward": 2.2913438081741333, "reward_std": 0.26863446831703186, "rewards/accuracy_reward": 1.3069688081741333, "rewards/format_reward": 0.984375, "step": 141 }, { "completion_length": 83.2578125, "epoch": 1.7108433734939759, "grad_norm": 4.741927242341381, "kl": 0.12548828125, "learning_rate": 7.14859437751004e-07, "loss": 0.005, "reward": 2.3960628509521484, "reward_std": 0.2550785541534424, "rewards/accuracy_reward": 1.3960627913475037, "rewards/format_reward": 1.0, "step": 142 }, { "completion_length": 86.671875, "epoch": 1.7228915662650603, "grad_norm": 3.0874349711182494, "kl": 0.07470703125, "learning_rate": 7.128514056224899e-07, "loss": 0.003, "reward": 2.3813560009002686, "reward_std": 0.25298502296209335, "rewards/accuracy_reward": 1.381356120109558, "rewards/format_reward": 1.0, "step": 143 }, { "completion_length": 80.40625, "epoch": 1.7349397590361446, "grad_norm": 9.215211678123678, "kl": 0.085693359375, "learning_rate": 7.108433734939758e-07, "loss": 0.0034, "reward": 2.3150322437286377, "reward_std": 0.23231424391269684, "rewards/accuracy_reward": 1.315032422542572, "rewards/format_reward": 1.0, "step": 144 }, { "completion_length": 79.5859375, "epoch": 1.7469879518072289, "grad_norm": 3.3677362414264307, "kl": 0.098876953125, "learning_rate": 7.088353413654619e-07, "loss": 0.0039, "reward": 2.2901567220687866, "reward_std": 0.21487458050251007, "rewards/accuracy_reward": 1.2979693412780762, "rewards/format_reward": 0.9921875, "step": 145 }, { "completion_length": 87.2734375, "epoch": 1.7590361445783134, "grad_norm": 3.8053306313986037, "kl": 0.104736328125, "learning_rate": 7.068273092369477e-07, "loss": 0.0042, "reward": 2.2074761390686035, "reward_std": 0.24223129451274872, "rewards/accuracy_reward": 1.2074760794639587, "rewards/format_reward": 1.0, "step": 146 }, { "completion_length": 88.984375, "epoch": 1.7710843373493976, "grad_norm": 4.960937467624004, "kl": 0.08251953125, "learning_rate": 7.048192771084337e-07, "loss": 0.0033, "reward": 2.2357683181762695, "reward_std": 0.2608248367905617, "rewards/accuracy_reward": 1.2435806393623352, "rewards/format_reward": 0.9921875, "step": 147 }, { "completion_length": 80.421875, "epoch": 1.783132530120482, "grad_norm": 3.5313461555382717, "kl": 0.106689453125, "learning_rate": 7.028112449799196e-07, "loss": 0.0042, "reward": 2.223365068435669, "reward_std": 0.20793087780475616, "rewards/accuracy_reward": 1.2311774492263794, "rewards/format_reward": 0.9921875, "step": 148 }, { "completion_length": 81.6328125, "epoch": 1.7951807228915664, "grad_norm": 3.917968857756188, "kl": 0.082763671875, "learning_rate": 7.008032128514057e-07, "loss": 0.0033, "reward": 2.431049346923828, "reward_std": 0.25210463255643845, "rewards/accuracy_reward": 1.4310495257377625, "rewards/format_reward": 1.0, "step": 149 }, { "completion_length": 82.71875, "epoch": 1.8072289156626506, "grad_norm": 3.2751640437820417, "kl": 0.105224609375, "learning_rate": 6.987951807228916e-07, "loss": 0.0042, "reward": 2.167607069015503, "reward_std": 0.20023201406002045, "rewards/accuracy_reward": 1.183232069015503, "rewards/format_reward": 0.984375, "step": 150 }, { "completion_length": 80.1015625, "epoch": 1.819277108433735, "grad_norm": 3.696030829693263, "kl": 0.09716796875, "learning_rate": 6.967871485943774e-07, "loss": 0.0039, "reward": 2.545083999633789, "reward_std": 0.17634352296590805, "rewards/accuracy_reward": 1.5450841188430786, "rewards/format_reward": 1.0, "step": 151 }, { "completion_length": 81.6484375, "epoch": 1.8313253012048194, "grad_norm": 5.419229696650584, "kl": 0.119873046875, "learning_rate": 6.947791164658634e-07, "loss": 0.0048, "reward": 2.144273281097412, "reward_std": 0.2491978257894516, "rewards/accuracy_reward": 1.152085781097412, "rewards/format_reward": 0.9921875, "step": 152 }, { "completion_length": 77.96875, "epoch": 1.8433734939759037, "grad_norm": 34.81233821704641, "kl": 0.09619140625, "learning_rate": 6.927710843373493e-07, "loss": 0.0039, "reward": 2.4207249879837036, "reward_std": 0.22066732123494148, "rewards/accuracy_reward": 1.4207251071929932, "rewards/format_reward": 1.0, "step": 153 }, { "completion_length": 81.3984375, "epoch": 1.855421686746988, "grad_norm": 4.095705367504911, "kl": 0.101806640625, "learning_rate": 6.907630522088354e-07, "loss": 0.0041, "reward": 2.160383105278015, "reward_std": 0.27165083587169647, "rewards/accuracy_reward": 1.1681956052780151, "rewards/format_reward": 0.9921875, "step": 154 }, { "completion_length": 79.78125, "epoch": 1.8674698795180724, "grad_norm": 3.0440685644807663, "kl": 0.11865234375, "learning_rate": 6.887550200803212e-07, "loss": 0.0047, "reward": 2.4971319437026978, "reward_std": 0.16808781027793884, "rewards/accuracy_reward": 1.4971320629119873, "rewards/format_reward": 1.0, "step": 155 }, { "completion_length": 83.09375, "epoch": 1.8795180722891565, "grad_norm": 3.1771226883841206, "kl": 0.10498046875, "learning_rate": 6.867469879518072e-07, "loss": 0.0042, "reward": 2.1450811624526978, "reward_std": 0.2694619745016098, "rewards/accuracy_reward": 1.1450812816619873, "rewards/format_reward": 1.0, "step": 156 }, { "completion_length": 81.9453125, "epoch": 1.891566265060241, "grad_norm": 3.4230588560037583, "kl": 0.113525390625, "learning_rate": 6.847389558232931e-07, "loss": 0.0045, "reward": 2.44959032535553, "reward_std": 0.16196198761463165, "rewards/accuracy_reward": 1.4574028253555298, "rewards/format_reward": 0.9921875, "step": 157 }, { "completion_length": 86.203125, "epoch": 1.9036144578313254, "grad_norm": 5.9344079114737, "kl": 0.1015625, "learning_rate": 6.827309236947792e-07, "loss": 0.0041, "reward": 2.1924350261688232, "reward_std": 0.1869198903441429, "rewards/accuracy_reward": 1.1924351453781128, "rewards/format_reward": 1.0, "step": 158 }, { "completion_length": 84.7734375, "epoch": 1.9156626506024095, "grad_norm": 3.7338258911048707, "kl": 0.105224609375, "learning_rate": 6.807228915662651e-07, "loss": 0.0042, "reward": 2.298088550567627, "reward_std": 0.2152806669473648, "rewards/accuracy_reward": 1.3059011697769165, "rewards/format_reward": 0.9921875, "step": 159 }, { "completion_length": 88.2109375, "epoch": 1.927710843373494, "grad_norm": 3.2737012532681535, "kl": 0.124755859375, "learning_rate": 6.787148594377509e-07, "loss": 0.005, "reward": 2.3695740699768066, "reward_std": 0.300421878695488, "rewards/accuracy_reward": 1.3930113911628723, "rewards/format_reward": 0.9765625, "step": 160 }, { "completion_length": 82.9921875, "epoch": 1.9397590361445785, "grad_norm": 14.347253854862437, "kl": 0.119873046875, "learning_rate": 6.767068273092369e-07, "loss": 0.0048, "reward": 2.306626796722412, "reward_std": 0.2548489645123482, "rewards/accuracy_reward": 1.3222516179084778, "rewards/format_reward": 0.984375, "step": 161 }, { "completion_length": 87.734375, "epoch": 1.9518072289156625, "grad_norm": 3.457686333163172, "kl": 0.109375, "learning_rate": 6.746987951807228e-07, "loss": 0.0044, "reward": 2.2328758239746094, "reward_std": 0.28791245073080063, "rewards/accuracy_reward": 1.2641257643699646, "rewards/format_reward": 0.96875, "step": 162 }, { "completion_length": 83.25, "epoch": 1.963855421686747, "grad_norm": 4.1768305143971824, "kl": 0.12353515625, "learning_rate": 6.726907630522089e-07, "loss": 0.0049, "reward": 2.2161502838134766, "reward_std": 0.25863420963287354, "rewards/accuracy_reward": 1.2630252242088318, "rewards/format_reward": 0.953125, "step": 163 }, { "completion_length": 88.734375, "epoch": 1.9759036144578315, "grad_norm": 4.842793088552531, "kl": 0.105712890625, "learning_rate": 6.706827309236947e-07, "loss": 0.0042, "reward": 2.090719521045685, "reward_std": 0.25029148161411285, "rewards/accuracy_reward": 1.1141569316387177, "rewards/format_reward": 0.9765625, "step": 164 }, { "completion_length": 86.1953125, "epoch": 1.9879518072289155, "grad_norm": 3.657481472750154, "kl": 0.125244140625, "learning_rate": 6.686746987951807e-07, "loss": 0.005, "reward": 2.2765581607818604, "reward_std": 0.2915503680706024, "rewards/accuracy_reward": 1.30780827999115, "rewards/format_reward": 0.96875, "step": 165 }, { "completion_length": 92.16666793823242, "epoch": 2.0, "grad_norm": 3.6057161188599776, "kl": 0.125732421875, "learning_rate": 6.666666666666666e-07, "loss": 0.0047, "reward": 2.234604835510254, "reward_std": 0.2570358142256737, "rewards/accuracy_reward": 1.2346049845218658, "rewards/format_reward": 1.0, "step": 166 }, { "completion_length": 87.1484375, "epoch": 2.0120481927710845, "grad_norm": 3.7603470456590564, "kl": 0.094482421875, "learning_rate": 6.646586345381526e-07, "loss": 0.0038, "reward": 2.2034374475479126, "reward_std": 0.3387380540370941, "rewards/accuracy_reward": 1.2112498879432678, "rewards/format_reward": 0.9921875, "step": 167 }, { "completion_length": 86.1953125, "epoch": 2.0240963855421685, "grad_norm": 4.4381952945033465, "kl": 0.09765625, "learning_rate": 6.626506024096386e-07, "loss": 0.0039, "reward": 2.222957730293274, "reward_std": 0.2284381240606308, "rewards/accuracy_reward": 1.238582730293274, "rewards/format_reward": 0.984375, "step": 168 }, { "completion_length": 84.3125, "epoch": 2.036144578313253, "grad_norm": 3.399081917667578, "kl": 0.0966796875, "learning_rate": 6.606425702811244e-07, "loss": 0.0039, "reward": 2.2074966430664062, "reward_std": 0.2783028930425644, "rewards/accuracy_reward": 1.2231215238571167, "rewards/format_reward": 0.984375, "step": 169 }, { "completion_length": 84.1640625, "epoch": 2.0481927710843375, "grad_norm": 3.794821230336393, "kl": 0.10400390625, "learning_rate": 6.586345381526104e-07, "loss": 0.0042, "reward": 2.2774429321289062, "reward_std": 0.18755661696195602, "rewards/accuracy_reward": 1.2774428129196167, "rewards/format_reward": 1.0, "step": 170 }, { "completion_length": 84.7421875, "epoch": 2.0602409638554215, "grad_norm": 5.41653478361753, "kl": 0.09130859375, "learning_rate": 6.566265060240963e-07, "loss": 0.0036, "reward": 2.2825827598571777, "reward_std": 0.20142250508069992, "rewards/accuracy_reward": 1.2825825810432434, "rewards/format_reward": 1.0, "step": 171 }, { "completion_length": 78.421875, "epoch": 2.072289156626506, "grad_norm": 4.831319526617051, "kl": 0.099365234375, "learning_rate": 6.546184738955824e-07, "loss": 0.004, "reward": 2.4247552156448364, "reward_std": 0.19953592866659164, "rewards/accuracy_reward": 1.4247552752494812, "rewards/format_reward": 1.0, "step": 172 }, { "completion_length": 78.359375, "epoch": 2.0843373493975905, "grad_norm": 3.8109915515963038, "kl": 0.10498046875, "learning_rate": 6.526104417670682e-07, "loss": 0.0042, "reward": 2.3325507640838623, "reward_std": 0.26026056706905365, "rewards/accuracy_reward": 1.348175823688507, "rewards/format_reward": 0.984375, "step": 173 }, { "completion_length": 79.21875, "epoch": 2.0963855421686746, "grad_norm": 4.94758596751216, "kl": 0.130615234375, "learning_rate": 6.506024096385541e-07, "loss": 0.0052, "reward": 2.3614529371261597, "reward_std": 0.23941361159086227, "rewards/accuracy_reward": 1.3614528179168701, "rewards/format_reward": 1.0, "step": 174 }, { "completion_length": 80.8984375, "epoch": 2.108433734939759, "grad_norm": 4.645980861130919, "kl": 0.12646484375, "learning_rate": 6.485943775100401e-07, "loss": 0.0051, "reward": 2.148719310760498, "reward_std": 0.2538711354136467, "rewards/accuracy_reward": 1.1487191915512085, "rewards/format_reward": 1.0, "step": 175 }, { "completion_length": 78.921875, "epoch": 2.1204819277108435, "grad_norm": 3.362542245290514, "kl": 0.090576171875, "learning_rate": 6.465863453815261e-07, "loss": 0.0036, "reward": 2.3466458320617676, "reward_std": 0.21008533239364624, "rewards/accuracy_reward": 1.346645712852478, "rewards/format_reward": 1.0, "step": 176 }, { "completion_length": 78.5546875, "epoch": 2.1325301204819276, "grad_norm": 3.6960106974538585, "kl": 0.0908203125, "learning_rate": 6.445783132530121e-07, "loss": 0.0036, "reward": 2.4223729372024536, "reward_std": 0.15239863470196724, "rewards/accuracy_reward": 1.4223730564117432, "rewards/format_reward": 1.0, "step": 177 }, { "completion_length": 76.890625, "epoch": 2.144578313253012, "grad_norm": 3.5646239400027913, "kl": 0.103515625, "learning_rate": 6.425702811244979e-07, "loss": 0.0041, "reward": 2.4388126134872437, "reward_std": 0.22842204570770264, "rewards/accuracy_reward": 1.4466250538825989, "rewards/format_reward": 0.9921875, "step": 178 }, { "completion_length": 78.796875, "epoch": 2.1566265060240966, "grad_norm": 3.531186908359453, "kl": 0.099609375, "learning_rate": 6.405622489959839e-07, "loss": 0.004, "reward": 2.1039586067199707, "reward_std": 0.23404612392187119, "rewards/accuracy_reward": 1.1273961663246155, "rewards/format_reward": 0.9765625, "step": 179 }, { "completion_length": 75.75, "epoch": 2.1686746987951806, "grad_norm": 5.0096541073452485, "kl": 0.1015625, "learning_rate": 6.385542168674698e-07, "loss": 0.0041, "reward": 2.374882221221924, "reward_std": 0.2003496214747429, "rewards/accuracy_reward": 1.374882161617279, "rewards/format_reward": 1.0, "step": 180 }, { "completion_length": 79.9375, "epoch": 2.180722891566265, "grad_norm": 3.929802835585037, "kl": 0.102294921875, "learning_rate": 6.365461847389559e-07, "loss": 0.0041, "reward": 2.4310786724090576, "reward_std": 0.20660096406936646, "rewards/accuracy_reward": 1.4310787916183472, "rewards/format_reward": 1.0, "step": 181 }, { "completion_length": 80.7578125, "epoch": 2.1927710843373496, "grad_norm": 4.226674931816659, "kl": 0.09619140625, "learning_rate": 6.345381526104418e-07, "loss": 0.0038, "reward": 2.3952780961990356, "reward_std": 0.2160111963748932, "rewards/accuracy_reward": 1.3952780961990356, "rewards/format_reward": 1.0, "step": 182 }, { "completion_length": 80.484375, "epoch": 2.2048192771084336, "grad_norm": 3.463553859166022, "kl": 0.107421875, "learning_rate": 6.325301204819276e-07, "loss": 0.0043, "reward": 2.3913345336914062, "reward_std": 0.22311442345380783, "rewards/accuracy_reward": 1.3991470336914062, "rewards/format_reward": 0.9921875, "step": 183 }, { "completion_length": 78.484375, "epoch": 2.216867469879518, "grad_norm": 3.9553841913647356, "kl": 0.08642578125, "learning_rate": 6.305220883534136e-07, "loss": 0.0035, "reward": 2.353707432746887, "reward_std": 0.2809625118970871, "rewards/accuracy_reward": 1.3615199327468872, "rewards/format_reward": 0.9921875, "step": 184 }, { "completion_length": 86.203125, "epoch": 2.2289156626506026, "grad_norm": 6.103835532514207, "kl": 0.075439453125, "learning_rate": 6.285140562248996e-07, "loss": 0.003, "reward": 2.411812663078308, "reward_std": 0.17931858450174332, "rewards/accuracy_reward": 1.411812663078308, "rewards/format_reward": 1.0, "step": 185 }, { "completion_length": 77.515625, "epoch": 2.2409638554216866, "grad_norm": 3.91857543195832, "kl": 0.10107421875, "learning_rate": 6.265060240963856e-07, "loss": 0.004, "reward": 2.2299575805664062, "reward_std": 0.2100789025425911, "rewards/accuracy_reward": 1.2377700209617615, "rewards/format_reward": 0.9921875, "step": 186 }, { "completion_length": 77.09375, "epoch": 2.253012048192771, "grad_norm": 3.8592654709883796, "kl": 0.095947265625, "learning_rate": 6.244979919678714e-07, "loss": 0.0038, "reward": 2.47510826587677, "reward_std": 0.2556135207414627, "rewards/accuracy_reward": 1.4829206466674805, "rewards/format_reward": 0.9921875, "step": 187 }, { "completion_length": 79.2890625, "epoch": 2.2650602409638556, "grad_norm": 6.921774157099546, "kl": 0.093017578125, "learning_rate": 6.224899598393574e-07, "loss": 0.0037, "reward": 2.3394941091537476, "reward_std": 0.23163118958473206, "rewards/accuracy_reward": 1.3394939303398132, "rewards/format_reward": 1.0, "step": 188 }, { "completion_length": 79.546875, "epoch": 2.2771084337349397, "grad_norm": 5.699992937395376, "kl": 0.08544921875, "learning_rate": 6.204819277108434e-07, "loss": 0.0034, "reward": 2.330021381378174, "reward_std": 0.21045994758605957, "rewards/accuracy_reward": 1.3300212621688843, "rewards/format_reward": 1.0, "step": 189 }, { "completion_length": 77.421875, "epoch": 2.289156626506024, "grad_norm": 4.425700742489554, "kl": 0.098388671875, "learning_rate": 6.184738955823293e-07, "loss": 0.0039, "reward": 2.2294440269470215, "reward_std": 0.21671444922685623, "rewards/accuracy_reward": 1.2294440865516663, "rewards/format_reward": 1.0, "step": 190 }, { "completion_length": 74.6640625, "epoch": 2.3012048192771086, "grad_norm": 3.5141288907091783, "kl": 0.08154296875, "learning_rate": 6.164658634538153e-07, "loss": 0.0033, "reward": 2.417364239692688, "reward_std": 0.18784678727388382, "rewards/accuracy_reward": 1.4173641800880432, "rewards/format_reward": 1.0, "step": 191 }, { "completion_length": 74.53125, "epoch": 2.3132530120481927, "grad_norm": 4.6610918738389095, "kl": 0.096435546875, "learning_rate": 6.144578313253011e-07, "loss": 0.0039, "reward": 2.4048426151275635, "reward_std": 0.2764005810022354, "rewards/accuracy_reward": 1.412655234336853, "rewards/format_reward": 0.9921875, "step": 192 }, { "completion_length": 80.8984375, "epoch": 2.325301204819277, "grad_norm": 6.933183617809393, "kl": 0.07861328125, "learning_rate": 6.124497991967871e-07, "loss": 0.0031, "reward": 2.2180745601654053, "reward_std": 0.2127843052148819, "rewards/accuracy_reward": 1.21807461977005, "rewards/format_reward": 1.0, "step": 193 }, { "completion_length": 80.9296875, "epoch": 2.337349397590361, "grad_norm": 4.526116466506062, "kl": 0.088623046875, "learning_rate": 6.104417670682731e-07, "loss": 0.0035, "reward": 2.2327487468719482, "reward_std": 0.2369586005806923, "rewards/accuracy_reward": 1.240561306476593, "rewards/format_reward": 0.9921875, "step": 194 }, { "completion_length": 79.8359375, "epoch": 2.3493975903614457, "grad_norm": 3.410370565415923, "kl": 0.09326171875, "learning_rate": 6.084337349397591e-07, "loss": 0.0037, "reward": 2.222264051437378, "reward_std": 0.26303592324256897, "rewards/accuracy_reward": 1.230076551437378, "rewards/format_reward": 0.9921875, "step": 195 }, { "completion_length": 73.8828125, "epoch": 2.36144578313253, "grad_norm": 3.962197046428477, "kl": 0.103271484375, "learning_rate": 6.064257028112449e-07, "loss": 0.0041, "reward": 2.296523690223694, "reward_std": 0.370675727725029, "rewards/accuracy_reward": 1.2965235710144043, "rewards/format_reward": 1.0, "step": 196 }, { "completion_length": 74.515625, "epoch": 2.3734939759036147, "grad_norm": 3.7849181083166066, "kl": 0.100341796875, "learning_rate": 6.044176706827308e-07, "loss": 0.004, "reward": 2.1898573637008667, "reward_std": 0.2903239354491234, "rewards/accuracy_reward": 1.1898574829101562, "rewards/format_reward": 1.0, "step": 197 }, { "completion_length": 71.015625, "epoch": 2.3855421686746987, "grad_norm": 4.598411590922377, "kl": 0.09716796875, "learning_rate": 6.024096385542169e-07, "loss": 0.0039, "reward": 2.3405251502990723, "reward_std": 0.1668776124715805, "rewards/accuracy_reward": 1.3405250310897827, "rewards/format_reward": 1.0, "step": 198 }, { "completion_length": 72.0234375, "epoch": 2.397590361445783, "grad_norm": 4.094960420612339, "kl": 0.08447265625, "learning_rate": 6.004016064257028e-07, "loss": 0.0034, "reward": 2.2692129611968994, "reward_std": 0.22979120910167694, "rewards/accuracy_reward": 1.2848379015922546, "rewards/format_reward": 0.984375, "step": 199 }, { "completion_length": 76.34375, "epoch": 2.4096385542168672, "grad_norm": 5.228591551586785, "kl": 0.0771484375, "learning_rate": 5.983935742971888e-07, "loss": 0.0031, "reward": 2.29106342792511, "reward_std": 0.22756240516901016, "rewards/accuracy_reward": 1.2910634279251099, "rewards/format_reward": 1.0, "step": 200 }, { "completion_length": 79.3828125, "epoch": 2.4216867469879517, "grad_norm": 3.532651567007306, "kl": 0.140869140625, "learning_rate": 5.963855421686746e-07, "loss": 0.0056, "reward": 2.218053698539734, "reward_std": 0.24822543561458588, "rewards/accuracy_reward": 1.2180536985397339, "rewards/format_reward": 1.0, "step": 201 }, { "completion_length": 76.0, "epoch": 2.433734939759036, "grad_norm": 3.316768093202225, "kl": 0.088134765625, "learning_rate": 5.943775100401606e-07, "loss": 0.0035, "reward": 2.26613187789917, "reward_std": 0.24750088155269623, "rewards/accuracy_reward": 1.2739443182945251, "rewards/format_reward": 0.9921875, "step": 202 }, { "completion_length": 70.5234375, "epoch": 2.4457831325301207, "grad_norm": 9.031966519770473, "kl": 0.099853515625, "learning_rate": 5.923694779116466e-07, "loss": 0.004, "reward": 2.317081928253174, "reward_std": 0.24299181252717972, "rewards/accuracy_reward": 1.3248944282531738, "rewards/format_reward": 0.9921875, "step": 203 }, { "completion_length": 72.1484375, "epoch": 2.4578313253012047, "grad_norm": 4.923799185057533, "kl": 0.09716796875, "learning_rate": 5.903614457831325e-07, "loss": 0.0039, "reward": 2.202351689338684, "reward_std": 0.24287213385105133, "rewards/accuracy_reward": 1.2023517489433289, "rewards/format_reward": 1.0, "step": 204 }, { "completion_length": 75.5390625, "epoch": 2.4698795180722892, "grad_norm": 10.424209527328602, "kl": 0.0849609375, "learning_rate": 5.883534136546184e-07, "loss": 0.0034, "reward": 2.3431246280670166, "reward_std": 0.21441341936588287, "rewards/accuracy_reward": 1.3431245684623718, "rewards/format_reward": 1.0, "step": 205 }, { "completion_length": 74.1328125, "epoch": 2.4819277108433733, "grad_norm": 5.39794558294026, "kl": 0.08349609375, "learning_rate": 5.863453815261043e-07, "loss": 0.0033, "reward": 2.318004846572876, "reward_std": 0.1649407297372818, "rewards/accuracy_reward": 1.3180049657821655, "rewards/format_reward": 1.0, "step": 206 }, { "completion_length": 70.828125, "epoch": 2.4939759036144578, "grad_norm": 5.651509118393077, "kl": 0.099609375, "learning_rate": 5.843373493975904e-07, "loss": 0.004, "reward": 2.2745083570480347, "reward_std": 0.1795399785041809, "rewards/accuracy_reward": 1.27450829744339, "rewards/format_reward": 1.0, "step": 207 }, { "completion_length": 75.1484375, "epoch": 2.5060240963855422, "grad_norm": 3.374258945078158, "kl": 0.099853515625, "learning_rate": 5.823293172690763e-07, "loss": 0.004, "reward": 2.183190941810608, "reward_std": 0.19665208458900452, "rewards/accuracy_reward": 1.183190941810608, "rewards/format_reward": 1.0, "step": 208 }, { "completion_length": 75.15625, "epoch": 2.5180722891566267, "grad_norm": 3.680961209255419, "kl": 0.085693359375, "learning_rate": 5.803212851405623e-07, "loss": 0.0034, "reward": 2.3783202171325684, "reward_std": 0.21517369151115417, "rewards/accuracy_reward": 1.3861328959465027, "rewards/format_reward": 0.9921875, "step": 209 }, { "completion_length": 75.890625, "epoch": 2.5301204819277108, "grad_norm": 4.203577590596214, "kl": 0.093017578125, "learning_rate": 5.783132530120481e-07, "loss": 0.0037, "reward": 2.232303738594055, "reward_std": 0.21822457760572433, "rewards/accuracy_reward": 1.2401162385940552, "rewards/format_reward": 0.9921875, "step": 210 }, { "completion_length": 72.5234375, "epoch": 2.5421686746987953, "grad_norm": 5.049709537985753, "kl": 0.09033203125, "learning_rate": 5.76305220883534e-07, "loss": 0.0036, "reward": 2.3138071298599243, "reward_std": 0.18903522193431854, "rewards/accuracy_reward": 1.3138071298599243, "rewards/format_reward": 1.0, "step": 211 }, { "completion_length": 77.6796875, "epoch": 2.5542168674698793, "grad_norm": 4.79270453347689, "kl": 0.10791015625, "learning_rate": 5.742971887550201e-07, "loss": 0.0043, "reward": 2.35454523563385, "reward_std": 0.260717436671257, "rewards/accuracy_reward": 1.36235773563385, "rewards/format_reward": 0.9921875, "step": 212 }, { "completion_length": 75.5234375, "epoch": 2.566265060240964, "grad_norm": 3.8110594359613694, "kl": 0.132080078125, "learning_rate": 5.72289156626506e-07, "loss": 0.0053, "reward": 2.3396618366241455, "reward_std": 0.2776957154273987, "rewards/accuracy_reward": 1.3474743366241455, "rewards/format_reward": 0.9921875, "step": 213 }, { "completion_length": 78.8203125, "epoch": 2.5783132530120483, "grad_norm": 3.5277793226603467, "kl": 0.082763671875, "learning_rate": 5.70281124497992e-07, "loss": 0.0033, "reward": 2.282657027244568, "reward_std": 0.20082392543554306, "rewards/accuracy_reward": 1.2826570868492126, "rewards/format_reward": 1.0, "step": 214 }, { "completion_length": 79.7265625, "epoch": 2.5903614457831328, "grad_norm": 5.661825173466666, "kl": 0.070068359375, "learning_rate": 5.682730923694778e-07, "loss": 0.0028, "reward": 2.2916386127471924, "reward_std": 0.22843700647354126, "rewards/accuracy_reward": 1.2916386723518372, "rewards/format_reward": 1.0, "step": 215 }, { "completion_length": 75.484375, "epoch": 2.602409638554217, "grad_norm": 5.408656767411551, "kl": 0.074951171875, "learning_rate": 5.662650602409639e-07, "loss": 0.003, "reward": 2.4862678050994873, "reward_std": 0.17430586367845535, "rewards/accuracy_reward": 1.4862679243087769, "rewards/format_reward": 1.0, "step": 216 }, { "completion_length": 75.4140625, "epoch": 2.6144578313253013, "grad_norm": 4.437169209890788, "kl": 0.1123046875, "learning_rate": 5.642570281124498e-07, "loss": 0.0045, "reward": 2.2881970405578613, "reward_std": 0.24159938842058182, "rewards/accuracy_reward": 1.3116344809532166, "rewards/format_reward": 0.9765625, "step": 217 }, { "completion_length": 77.1484375, "epoch": 2.6265060240963853, "grad_norm": 3.7017405154535608, "kl": 0.0849609375, "learning_rate": 5.622489959839358e-07, "loss": 0.0034, "reward": 2.42057728767395, "reward_std": 0.1918034851551056, "rewards/accuracy_reward": 1.4205771684646606, "rewards/format_reward": 1.0, "step": 218 }, { "completion_length": 74.9921875, "epoch": 2.63855421686747, "grad_norm": 3.0572748613034184, "kl": 0.08056640625, "learning_rate": 5.602409638554216e-07, "loss": 0.0032, "reward": 2.296902298927307, "reward_std": 0.22776726633310318, "rewards/accuracy_reward": 1.2969022989273071, "rewards/format_reward": 1.0, "step": 219 }, { "completion_length": 77.9375, "epoch": 2.6506024096385543, "grad_norm": 5.142063259050984, "kl": 0.08251953125, "learning_rate": 5.582329317269075e-07, "loss": 0.0033, "reward": 2.411815643310547, "reward_std": 0.20656804740428925, "rewards/accuracy_reward": 1.4118155241012573, "rewards/format_reward": 1.0, "step": 220 }, { "completion_length": 75.0625, "epoch": 2.662650602409639, "grad_norm": 9.244315362233946, "kl": 0.094482421875, "learning_rate": 5.562248995983936e-07, "loss": 0.0038, "reward": 2.2525359392166138, "reward_std": 0.23683273047208786, "rewards/accuracy_reward": 1.2681609392166138, "rewards/format_reward": 0.984375, "step": 221 }, { "completion_length": 78.390625, "epoch": 2.674698795180723, "grad_norm": 4.89406748105177, "kl": 0.078125, "learning_rate": 5.542168674698795e-07, "loss": 0.0031, "reward": 2.33753764629364, "reward_std": 0.21247170120477676, "rewards/accuracy_reward": 1.3453501462936401, "rewards/format_reward": 0.9921875, "step": 222 }, { "completion_length": 73.0859375, "epoch": 2.6867469879518073, "grad_norm": 3.6393688137680464, "kl": 0.0810546875, "learning_rate": 5.522088353413655e-07, "loss": 0.0032, "reward": 2.2808330059051514, "reward_std": 0.1841505616903305, "rewards/accuracy_reward": 1.280833125114441, "rewards/format_reward": 1.0, "step": 223 }, { "completion_length": 77.1484375, "epoch": 2.6987951807228914, "grad_norm": 2.9614100491209516, "kl": 0.08447265625, "learning_rate": 5.502008032128513e-07, "loss": 0.0034, "reward": 2.256025791168213, "reward_std": 0.22689195722341537, "rewards/accuracy_reward": 1.271650791168213, "rewards/format_reward": 0.984375, "step": 224 }, { "completion_length": 72.6015625, "epoch": 2.710843373493976, "grad_norm": 4.624802749562738, "kl": 0.0810546875, "learning_rate": 5.481927710843374e-07, "loss": 0.0032, "reward": 2.367666721343994, "reward_std": 0.20605457574129105, "rewards/accuracy_reward": 1.367666482925415, "rewards/format_reward": 1.0, "step": 225 }, { "completion_length": 70.859375, "epoch": 2.7228915662650603, "grad_norm": 6.0943428059060505, "kl": 0.10205078125, "learning_rate": 5.461847389558233e-07, "loss": 0.0041, "reward": 2.3246583938598633, "reward_std": 0.17254704982042313, "rewards/accuracy_reward": 1.3324708938598633, "rewards/format_reward": 0.9921875, "step": 226 }, { "completion_length": 75.640625, "epoch": 2.734939759036145, "grad_norm": 4.26546660385252, "kl": 0.090087890625, "learning_rate": 5.441767068273092e-07, "loss": 0.0036, "reward": 2.307809591293335, "reward_std": 0.2002812698483467, "rewards/accuracy_reward": 1.315622091293335, "rewards/format_reward": 0.9921875, "step": 227 }, { "completion_length": 73.671875, "epoch": 2.746987951807229, "grad_norm": 3.4690497244218435, "kl": 0.0927734375, "learning_rate": 5.421686746987951e-07, "loss": 0.0037, "reward": 2.4064533710479736, "reward_std": 0.1763758659362793, "rewards/accuracy_reward": 1.4142658710479736, "rewards/format_reward": 0.9921875, "step": 228 }, { "completion_length": 77.265625, "epoch": 2.7590361445783134, "grad_norm": 3.8015660942675313, "kl": 0.107666015625, "learning_rate": 5.401606425702811e-07, "loss": 0.0043, "reward": 2.417749524116516, "reward_std": 0.20080577582120895, "rewards/accuracy_reward": 1.4333745837211609, "rewards/format_reward": 0.984375, "step": 229 }, { "completion_length": 78.6484375, "epoch": 2.7710843373493974, "grad_norm": 4.593078230781537, "kl": 0.081298828125, "learning_rate": 5.381526104417671e-07, "loss": 0.0032, "reward": 2.310904383659363, "reward_std": 0.20601534098386765, "rewards/accuracy_reward": 1.326529324054718, "rewards/format_reward": 0.984375, "step": 230 }, { "completion_length": 69.75, "epoch": 2.783132530120482, "grad_norm": 4.781119598148597, "kl": 0.092041015625, "learning_rate": 5.36144578313253e-07, "loss": 0.0037, "reward": 2.4060455560684204, "reward_std": 0.1945626586675644, "rewards/accuracy_reward": 1.41385817527771, "rewards/format_reward": 0.9921875, "step": 231 }, { "completion_length": 72.125, "epoch": 2.7951807228915664, "grad_norm": 3.6431689651666925, "kl": 0.084716796875, "learning_rate": 5.34136546184739e-07, "loss": 0.0034, "reward": 2.2687569856643677, "reward_std": 0.20781449228525162, "rewards/accuracy_reward": 1.2765693664550781, "rewards/format_reward": 0.9921875, "step": 232 }, { "completion_length": 75.28125, "epoch": 2.807228915662651, "grad_norm": 3.463525581618983, "kl": 0.0830078125, "learning_rate": 5.321285140562248e-07, "loss": 0.0033, "reward": 2.2786985635757446, "reward_std": 0.1869373545050621, "rewards/accuracy_reward": 1.2865110039710999, "rewards/format_reward": 0.9921875, "step": 233 }, { "completion_length": 72.390625, "epoch": 2.819277108433735, "grad_norm": 3.989550051539227, "kl": 0.08935546875, "learning_rate": 5.301204819277109e-07, "loss": 0.0036, "reward": 2.2122349739074707, "reward_std": 0.17366793006658554, "rewards/accuracy_reward": 1.212234914302826, "rewards/format_reward": 1.0, "step": 234 }, { "completion_length": 68.4296875, "epoch": 2.8313253012048194, "grad_norm": 5.293732432179004, "kl": 0.1162109375, "learning_rate": 5.281124497991968e-07, "loss": 0.0046, "reward": 2.273004412651062, "reward_std": 0.21551835536956787, "rewards/accuracy_reward": 1.2730044722557068, "rewards/format_reward": 1.0, "step": 235 }, { "completion_length": 70.4765625, "epoch": 2.8433734939759034, "grad_norm": 3.483964465031993, "kl": 0.08642578125, "learning_rate": 5.261044176706827e-07, "loss": 0.0035, "reward": 2.5097464323043823, "reward_std": 0.21660751849412918, "rewards/accuracy_reward": 1.509746491909027, "rewards/format_reward": 1.0, "step": 236 }, { "completion_length": 67.1796875, "epoch": 2.855421686746988, "grad_norm": 3.2613871176315286, "kl": 0.109619140625, "learning_rate": 5.240963855421686e-07, "loss": 0.0044, "reward": 2.2154468297958374, "reward_std": 0.2426525428891182, "rewards/accuracy_reward": 1.2154468894004822, "rewards/format_reward": 1.0, "step": 237 }, { "completion_length": 73.875, "epoch": 2.8674698795180724, "grad_norm": 5.04569953866162, "kl": 0.105224609375, "learning_rate": 5.220883534136546e-07, "loss": 0.0042, "reward": 2.3947439193725586, "reward_std": 0.16551193594932556, "rewards/accuracy_reward": 1.3947439193725586, "rewards/format_reward": 1.0, "step": 238 }, { "completion_length": 70.03125, "epoch": 2.8795180722891565, "grad_norm": 3.2080049289623997, "kl": 0.10986328125, "learning_rate": 5.200803212851406e-07, "loss": 0.0044, "reward": 2.394848346710205, "reward_std": 0.22504138201475143, "rewards/accuracy_reward": 1.394848346710205, "rewards/format_reward": 1.0, "step": 239 }, { "completion_length": 70.90625, "epoch": 2.891566265060241, "grad_norm": 3.843192487462901, "kl": 0.1171875, "learning_rate": 5.180722891566265e-07, "loss": 0.0047, "reward": 2.2219191789627075, "reward_std": 0.2526251822710037, "rewards/accuracy_reward": 1.2219191193580627, "rewards/format_reward": 1.0, "step": 240 }, { "completion_length": 67.1328125, "epoch": 2.9036144578313254, "grad_norm": 3.0217979987505394, "kl": 0.104248046875, "learning_rate": 5.160642570281125e-07, "loss": 0.0042, "reward": 2.2357059717178345, "reward_std": 0.181558758020401, "rewards/accuracy_reward": 1.235705852508545, "rewards/format_reward": 1.0, "step": 241 }, { "completion_length": 67.0390625, "epoch": 2.9156626506024095, "grad_norm": 4.171949473201647, "kl": 0.1044921875, "learning_rate": 5.140562248995983e-07, "loss": 0.0042, "reward": 2.3148874044418335, "reward_std": 0.17748098075389862, "rewards/accuracy_reward": 1.3148874640464783, "rewards/format_reward": 1.0, "step": 242 }, { "completion_length": 65.8671875, "epoch": 2.927710843373494, "grad_norm": 8.908769866071971, "kl": 0.11181640625, "learning_rate": 5.120481927710843e-07, "loss": 0.0045, "reward": 2.2218422889709473, "reward_std": 0.1961566060781479, "rewards/accuracy_reward": 1.2296549081802368, "rewards/format_reward": 0.9921875, "step": 243 }, { "completion_length": 63.6953125, "epoch": 2.9397590361445785, "grad_norm": 12.929344924116855, "kl": 0.106201171875, "learning_rate": 5.100401606425703e-07, "loss": 0.0042, "reward": 2.4831990003585815, "reward_std": 0.17936265468597412, "rewards/accuracy_reward": 1.4831989407539368, "rewards/format_reward": 1.0, "step": 244 }, { "completion_length": 62.28125, "epoch": 2.9518072289156625, "grad_norm": 3.4705083145900404, "kl": 0.111328125, "learning_rate": 5.080321285140562e-07, "loss": 0.0044, "reward": 2.352734327316284, "reward_std": 0.2174607664346695, "rewards/accuracy_reward": 1.3683592081069946, "rewards/format_reward": 0.984375, "step": 245 }, { "completion_length": 69.640625, "epoch": 2.963855421686747, "grad_norm": 4.178352503452598, "kl": 0.111572265625, "learning_rate": 5.060240963855421e-07, "loss": 0.0045, "reward": 2.3825145959854126, "reward_std": 0.21491926908493042, "rewards/accuracy_reward": 1.3903270959854126, "rewards/format_reward": 0.9921875, "step": 246 }, { "completion_length": 65.875, "epoch": 2.9759036144578315, "grad_norm": 4.426857679190133, "kl": 0.149169921875, "learning_rate": 5.040160642570281e-07, "loss": 0.006, "reward": 2.1721856594085693, "reward_std": 0.2390434294939041, "rewards/accuracy_reward": 1.1721857190132141, "rewards/format_reward": 1.0, "step": 247 }, { "completion_length": 70.9921875, "epoch": 2.9879518072289155, "grad_norm": 4.720913912936636, "kl": 0.114013671875, "learning_rate": 5.020080321285141e-07, "loss": 0.0046, "reward": 2.2051347494125366, "reward_std": 0.2722553163766861, "rewards/accuracy_reward": 1.2285721898078918, "rewards/format_reward": 0.9765625, "step": 248 }, { "completion_length": 64.25000190734863, "epoch": 3.0, "grad_norm": 3.5181266600609904, "kl": 0.11962890625, "learning_rate": 5e-07, "loss": 0.0048, "reward": 2.1161320209503174, "reward_std": 0.430472195148468, "rewards/accuracy_reward": 1.1994653940200806, "rewards/format_reward": 0.9166666865348816, "step": 249 }, { "completion_length": 68.1875, "epoch": 3.0120481927710845, "grad_norm": 3.5431810235066643, "kl": 0.09619140625, "learning_rate": 4.979919678714859e-07, "loss": 0.0038, "reward": 2.323817491531372, "reward_std": 0.23299024999141693, "rewards/accuracy_reward": 1.3316298723220825, "rewards/format_reward": 0.9921875, "step": 250 }, { "completion_length": 71.6953125, "epoch": 3.0240963855421685, "grad_norm": 3.3542739826451173, "kl": 0.08642578125, "learning_rate": 4.959839357429718e-07, "loss": 0.0035, "reward": 2.411439895629883, "reward_std": 0.19917739927768707, "rewards/accuracy_reward": 1.4114398956298828, "rewards/format_reward": 1.0, "step": 251 }, { "completion_length": 68.109375, "epoch": 3.036144578313253, "grad_norm": 12.151823073672764, "kl": 0.110107421875, "learning_rate": 4.939759036144578e-07, "loss": 0.0044, "reward": 2.5318474769592285, "reward_std": 0.18056734651327133, "rewards/accuracy_reward": 1.5396599173545837, "rewards/format_reward": 0.9921875, "step": 252 }, { "completion_length": 72.578125, "epoch": 3.0481927710843375, "grad_norm": 3.219943316402962, "kl": 0.099853515625, "learning_rate": 4.919678714859438e-07, "loss": 0.004, "reward": 2.3200578689575195, "reward_std": 0.15618911385536194, "rewards/accuracy_reward": 1.3200578689575195, "rewards/format_reward": 1.0, "step": 253 }, { "completion_length": 61.3828125, "epoch": 3.0602409638554215, "grad_norm": 3.865556225897638, "kl": 0.10888671875, "learning_rate": 4.899598393574297e-07, "loss": 0.0044, "reward": 2.209138035774231, "reward_std": 0.17473262548446655, "rewards/accuracy_reward": 1.2091379761695862, "rewards/format_reward": 1.0, "step": 254 }, { "completion_length": 66.7421875, "epoch": 3.072289156626506, "grad_norm": 4.017362101946035, "kl": 0.1259765625, "learning_rate": 4.879518072289156e-07, "loss": 0.005, "reward": 2.139701724052429, "reward_std": 0.22376088798046112, "rewards/accuracy_reward": 1.1397016048431396, "rewards/format_reward": 1.0, "step": 255 }, { "completion_length": 62.71875, "epoch": 3.0843373493975905, "grad_norm": 3.4288754746391947, "kl": 0.140625, "learning_rate": 4.859437751004016e-07, "loss": 0.0056, "reward": 2.2105259895324707, "reward_std": 0.22984497249126434, "rewards/accuracy_reward": 1.2261508703231812, "rewards/format_reward": 0.984375, "step": 256 }, { "completion_length": 66.6953125, "epoch": 3.0963855421686746, "grad_norm": 3.481985490355864, "kl": 0.1181640625, "learning_rate": 4.839357429718875e-07, "loss": 0.0047, "reward": 2.5049203634262085, "reward_std": 0.1857297122478485, "rewards/accuracy_reward": 1.5049203634262085, "rewards/format_reward": 1.0, "step": 257 }, { "completion_length": 67.484375, "epoch": 3.108433734939759, "grad_norm": 3.6977753194922403, "kl": 0.107666015625, "learning_rate": 4.819277108433735e-07, "loss": 0.0043, "reward": 2.3002774715423584, "reward_std": 0.21863283962011337, "rewards/accuracy_reward": 1.3080899119377136, "rewards/format_reward": 0.9921875, "step": 258 }, { "completion_length": 71.984375, "epoch": 3.1204819277108435, "grad_norm": 3.2391554999759054, "kl": 0.099853515625, "learning_rate": 4.799196787148594e-07, "loss": 0.004, "reward": 2.404132843017578, "reward_std": 0.19443362206220627, "rewards/accuracy_reward": 1.4119452238082886, "rewards/format_reward": 0.9921875, "step": 259 }, { "completion_length": 70.3984375, "epoch": 3.1325301204819276, "grad_norm": 3.8470897735347993, "kl": 0.11181640625, "learning_rate": 4.779116465863453e-07, "loss": 0.0045, "reward": 2.2314306497573853, "reward_std": 0.1860732138156891, "rewards/accuracy_reward": 1.2392430305480957, "rewards/format_reward": 0.9921875, "step": 260 }, { "completion_length": 71.7109375, "epoch": 3.144578313253012, "grad_norm": 5.7256880192839965, "kl": 0.101806640625, "learning_rate": 4.7590361445783126e-07, "loss": 0.0041, "reward": 2.3397083282470703, "reward_std": 0.21985551714897156, "rewards/accuracy_reward": 1.3397083282470703, "rewards/format_reward": 1.0, "step": 261 }, { "completion_length": 72.7265625, "epoch": 3.1566265060240966, "grad_norm": 4.6788843643036255, "kl": 0.183837890625, "learning_rate": 4.7389558232931724e-07, "loss": 0.0074, "reward": 2.288654088973999, "reward_std": 0.25063957273960114, "rewards/accuracy_reward": 1.296466588973999, "rewards/format_reward": 0.9921875, "step": 262 }, { "completion_length": 66.96875, "epoch": 3.1686746987951806, "grad_norm": 4.000735227178484, "kl": 0.1171875, "learning_rate": 4.7188755020080317e-07, "loss": 0.0047, "reward": 2.385547637939453, "reward_std": 0.179743941873312, "rewards/accuracy_reward": 1.393360197544098, "rewards/format_reward": 0.9921875, "step": 263 }, { "completion_length": 73.078125, "epoch": 3.180722891566265, "grad_norm": 3.2436175706744903, "kl": 0.08837890625, "learning_rate": 4.6987951807228915e-07, "loss": 0.0035, "reward": 2.3714927434921265, "reward_std": 0.1866167113184929, "rewards/accuracy_reward": 1.3793052434921265, "rewards/format_reward": 0.9921875, "step": 264 }, { "completion_length": 67.7578125, "epoch": 3.1927710843373496, "grad_norm": 4.16773338040152, "kl": 0.09619140625, "learning_rate": 4.678714859437751e-07, "loss": 0.0038, "reward": 2.256360650062561, "reward_std": 0.2188187688589096, "rewards/accuracy_reward": 1.256360650062561, "rewards/format_reward": 1.0, "step": 265 }, { "completion_length": 71.6796875, "epoch": 3.2048192771084336, "grad_norm": 3.7554898641141388, "kl": 0.094482421875, "learning_rate": 4.6586345381526106e-07, "loss": 0.0038, "reward": 2.285356283187866, "reward_std": 0.2733229324221611, "rewards/accuracy_reward": 1.2853562831878662, "rewards/format_reward": 1.0, "step": 266 }, { "completion_length": 69.53125, "epoch": 3.216867469879518, "grad_norm": 3.1396081677261747, "kl": 0.11572265625, "learning_rate": 4.63855421686747e-07, "loss": 0.0046, "reward": 2.194140672683716, "reward_std": 0.2116081416606903, "rewards/accuracy_reward": 1.1941407322883606, "rewards/format_reward": 1.0, "step": 267 }, { "completion_length": 67.8203125, "epoch": 3.2289156626506026, "grad_norm": 7.260439555595242, "kl": 0.08837890625, "learning_rate": 4.6184738955823296e-07, "loss": 0.0035, "reward": 2.252182364463806, "reward_std": 0.1803755983710289, "rewards/accuracy_reward": 1.259994924068451, "rewards/format_reward": 0.9921875, "step": 268 }, { "completion_length": 67.390625, "epoch": 3.2409638554216866, "grad_norm": 3.5049860895757696, "kl": 0.08935546875, "learning_rate": 4.5983935742971884e-07, "loss": 0.0036, "reward": 2.2208237648010254, "reward_std": 0.23105446994304657, "rewards/accuracy_reward": 1.2286362648010254, "rewards/format_reward": 0.9921875, "step": 269 }, { "completion_length": 70.8515625, "epoch": 3.253012048192771, "grad_norm": 5.489156591080696, "kl": 0.131591796875, "learning_rate": 4.5783132530120476e-07, "loss": 0.0053, "reward": 2.2373805046081543, "reward_std": 0.2680865153670311, "rewards/accuracy_reward": 1.2373805046081543, "rewards/format_reward": 1.0, "step": 270 }, { "completion_length": 67.3359375, "epoch": 3.2650602409638556, "grad_norm": 3.943203757539833, "kl": 0.102783203125, "learning_rate": 4.5582329317269074e-07, "loss": 0.0041, "reward": 2.2856905460357666, "reward_std": 0.2643607556819916, "rewards/accuracy_reward": 1.2856906652450562, "rewards/format_reward": 1.0, "step": 271 }, { "completion_length": 76.703125, "epoch": 3.2771084337349397, "grad_norm": 4.067837029288379, "kl": 0.14794921875, "learning_rate": 4.5381526104417667e-07, "loss": 0.0059, "reward": 2.2173361778259277, "reward_std": 0.23457611352205276, "rewards/accuracy_reward": 1.2251486778259277, "rewards/format_reward": 0.9921875, "step": 272 }, { "completion_length": 70.9765625, "epoch": 3.289156626506024, "grad_norm": 3.356513487854019, "kl": 0.105712890625, "learning_rate": 4.5180722891566265e-07, "loss": 0.0042, "reward": 2.3274762630462646, "reward_std": 0.1404755339026451, "rewards/accuracy_reward": 1.327476143836975, "rewards/format_reward": 1.0, "step": 273 }, { "completion_length": 73.5546875, "epoch": 3.3012048192771086, "grad_norm": 2.8662666869018194, "kl": 0.087646484375, "learning_rate": 4.497991967871486e-07, "loss": 0.0035, "reward": 2.4234249591827393, "reward_std": 0.23345230519771576, "rewards/accuracy_reward": 1.4234249591827393, "rewards/format_reward": 1.0, "step": 274 }, { "completion_length": 76.2890625, "epoch": 3.3132530120481927, "grad_norm": 3.6359732134875027, "kl": 0.0849609375, "learning_rate": 4.4779116465863456e-07, "loss": 0.0034, "reward": 2.2799594402313232, "reward_std": 0.17667143046855927, "rewards/accuracy_reward": 1.2799595594406128, "rewards/format_reward": 1.0, "step": 275 }, { "completion_length": 74.9296875, "epoch": 3.325301204819277, "grad_norm": 3.4769457078888513, "kl": 0.1181640625, "learning_rate": 4.4578313253012043e-07, "loss": 0.0047, "reward": 2.282673478126526, "reward_std": 0.20452508330345154, "rewards/accuracy_reward": 1.282673418521881, "rewards/format_reward": 1.0, "step": 276 }, { "completion_length": 73.828125, "epoch": 3.337349397590361, "grad_norm": 5.230024279024117, "kl": 0.0830078125, "learning_rate": 4.437751004016064e-07, "loss": 0.0033, "reward": 2.2097089290618896, "reward_std": 0.22180304676294327, "rewards/accuracy_reward": 1.2097087502479553, "rewards/format_reward": 1.0, "step": 277 }, { "completion_length": 72.7109375, "epoch": 3.3493975903614457, "grad_norm": 3.8728422379908416, "kl": 0.095458984375, "learning_rate": 4.4176706827309234e-07, "loss": 0.0038, "reward": 2.491241931915283, "reward_std": 0.22739917039871216, "rewards/accuracy_reward": 1.4912420511245728, "rewards/format_reward": 1.0, "step": 278 }, { "completion_length": 78.5078125, "epoch": 3.36144578313253, "grad_norm": 3.6858021846036535, "kl": 0.0908203125, "learning_rate": 4.3975903614457827e-07, "loss": 0.0036, "reward": 2.243127226829529, "reward_std": 0.22939348965883255, "rewards/accuracy_reward": 1.2431272268295288, "rewards/format_reward": 1.0, "step": 279 }, { "completion_length": 72.765625, "epoch": 3.3734939759036147, "grad_norm": 4.156042584491376, "kl": 0.1044921875, "learning_rate": 4.3775100401606425e-07, "loss": 0.0042, "reward": 2.2150485515594482, "reward_std": 0.23025363683700562, "rewards/accuracy_reward": 1.2228610515594482, "rewards/format_reward": 0.9921875, "step": 280 }, { "completion_length": 77.0390625, "epoch": 3.3855421686746987, "grad_norm": 3.3549823921313475, "kl": 0.100341796875, "learning_rate": 4.3574297188755017e-07, "loss": 0.004, "reward": 2.211505889892578, "reward_std": 0.24677567183971405, "rewards/accuracy_reward": 1.227130949497223, "rewards/format_reward": 0.984375, "step": 281 }, { "completion_length": 78.296875, "epoch": 3.397590361445783, "grad_norm": 3.5036767872389514, "kl": 0.0859375, "learning_rate": 4.3373493975903615e-07, "loss": 0.0034, "reward": 2.346588611602783, "reward_std": 0.20112959295511246, "rewards/accuracy_reward": 1.3465884923934937, "rewards/format_reward": 1.0, "step": 282 }, { "completion_length": 84.484375, "epoch": 3.4096385542168672, "grad_norm": 3.0794227415803874, "kl": 0.09326171875, "learning_rate": 4.3172690763052203e-07, "loss": 0.0037, "reward": 2.230928421020508, "reward_std": 0.26287955790758133, "rewards/accuracy_reward": 1.2387409210205078, "rewards/format_reward": 0.9921875, "step": 283 }, { "completion_length": 84.0546875, "epoch": 3.4216867469879517, "grad_norm": 9.632017573370238, "kl": 0.086181640625, "learning_rate": 4.29718875502008e-07, "loss": 0.0034, "reward": 2.2049087285995483, "reward_std": 0.19046999514102936, "rewards/accuracy_reward": 1.204908847808838, "rewards/format_reward": 1.0, "step": 284 }, { "completion_length": 74.875, "epoch": 3.433734939759036, "grad_norm": 3.04437077789607, "kl": 0.07861328125, "learning_rate": 4.2771084337349393e-07, "loss": 0.0031, "reward": 2.3966974020004272, "reward_std": 0.1937796175479889, "rewards/accuracy_reward": 1.3966973423957825, "rewards/format_reward": 1.0, "step": 285 }, { "completion_length": 75.8359375, "epoch": 3.4457831325301207, "grad_norm": 5.311045139915637, "kl": 0.163330078125, "learning_rate": 4.257028112449799e-07, "loss": 0.0065, "reward": 2.3752543926239014, "reward_std": 0.2273067831993103, "rewards/accuracy_reward": 1.3830668926239014, "rewards/format_reward": 0.9921875, "step": 286 }, { "completion_length": 78.6328125, "epoch": 3.4578313253012047, "grad_norm": 3.0911678350526763, "kl": 0.082763671875, "learning_rate": 4.2369477911646584e-07, "loss": 0.0033, "reward": 2.3473113775253296, "reward_std": 0.14994988590478897, "rewards/accuracy_reward": 1.3473113775253296, "rewards/format_reward": 1.0, "step": 287 }, { "completion_length": 79.1640625, "epoch": 3.4698795180722892, "grad_norm": 3.5847413181475947, "kl": 0.0849609375, "learning_rate": 4.216867469879518e-07, "loss": 0.0034, "reward": 2.433477997779846, "reward_std": 0.1769290268421173, "rewards/accuracy_reward": 1.4334778785705566, "rewards/format_reward": 1.0, "step": 288 }, { "completion_length": 83.390625, "epoch": 3.4819277108433733, "grad_norm": 4.01569190307187, "kl": 0.09521484375, "learning_rate": 4.1967871485943775e-07, "loss": 0.0038, "reward": 2.2789034843444824, "reward_std": 0.2845103293657303, "rewards/accuracy_reward": 1.2867161631584167, "rewards/format_reward": 0.9921875, "step": 289 }, { "completion_length": 81.90625, "epoch": 3.4939759036144578, "grad_norm": 3.286849126987869, "kl": 0.08642578125, "learning_rate": 4.176706827309237e-07, "loss": 0.0035, "reward": 2.362874150276184, "reward_std": 0.19387810677289963, "rewards/accuracy_reward": 1.362874150276184, "rewards/format_reward": 1.0, "step": 290 }, { "completion_length": 82.6640625, "epoch": 3.5060240963855422, "grad_norm": 3.658103173473351, "kl": 0.10888671875, "learning_rate": 4.156626506024096e-07, "loss": 0.0043, "reward": 2.0810331106185913, "reward_std": 0.3057002127170563, "rewards/accuracy_reward": 1.088845670223236, "rewards/format_reward": 0.9921875, "step": 291 }, { "completion_length": 78.921875, "epoch": 3.5180722891566267, "grad_norm": 3.7103596490236774, "kl": 0.08349609375, "learning_rate": 4.1365461847389553e-07, "loss": 0.0033, "reward": 2.511967420578003, "reward_std": 0.16890805214643478, "rewards/accuracy_reward": 1.5119673609733582, "rewards/format_reward": 1.0, "step": 292 }, { "completion_length": 79.0703125, "epoch": 3.5301204819277108, "grad_norm": 4.407185593870522, "kl": 0.099853515625, "learning_rate": 4.116465863453815e-07, "loss": 0.004, "reward": 2.298495650291443, "reward_std": 0.18783311545848846, "rewards/accuracy_reward": 1.2984956502914429, "rewards/format_reward": 1.0, "step": 293 }, { "completion_length": 77.796875, "epoch": 3.5421686746987953, "grad_norm": 4.826014110118868, "kl": 0.09814453125, "learning_rate": 4.0963855421686744e-07, "loss": 0.0039, "reward": 2.2871015071868896, "reward_std": 0.2442024052143097, "rewards/accuracy_reward": 1.2871016263961792, "rewards/format_reward": 1.0, "step": 294 }, { "completion_length": 81.0390625, "epoch": 3.5542168674698793, "grad_norm": 5.044218587715949, "kl": 0.1220703125, "learning_rate": 4.076305220883534e-07, "loss": 0.0049, "reward": 2.3120492696762085, "reward_std": 0.26864828169345856, "rewards/accuracy_reward": 1.3198617696762085, "rewards/format_reward": 0.9921875, "step": 295 }, { "completion_length": 81.8046875, "epoch": 3.566265060240964, "grad_norm": 4.035337217053536, "kl": 0.102783203125, "learning_rate": 4.0562248995983934e-07, "loss": 0.0041, "reward": 2.2244678735733032, "reward_std": 0.19216852635145187, "rewards/accuracy_reward": 1.2244678139686584, "rewards/format_reward": 1.0, "step": 296 }, { "completion_length": 82.1875, "epoch": 3.5783132530120483, "grad_norm": 5.473424541297646, "kl": 0.082275390625, "learning_rate": 4.036144578313253e-07, "loss": 0.0033, "reward": 2.1482508182525635, "reward_std": 0.2517557144165039, "rewards/accuracy_reward": 1.1560633182525635, "rewards/format_reward": 0.9921875, "step": 297 }, { "completion_length": 76.8828125, "epoch": 3.5903614457831328, "grad_norm": 3.624065660089473, "kl": 0.099609375, "learning_rate": 4.0160642570281125e-07, "loss": 0.004, "reward": 2.460606813430786, "reward_std": 0.20688265562057495, "rewards/accuracy_reward": 1.476231873035431, "rewards/format_reward": 0.984375, "step": 298 }, { "completion_length": 73.8828125, "epoch": 3.602409638554217, "grad_norm": 3.2496622555871775, "kl": 0.10302734375, "learning_rate": 3.995983935742971e-07, "loss": 0.0041, "reward": 2.448202967643738, "reward_std": 0.20513835549354553, "rewards/accuracy_reward": 1.4482029676437378, "rewards/format_reward": 1.0, "step": 299 }, { "completion_length": 73.8828125, "epoch": 3.6144578313253013, "grad_norm": 3.248403260656612, "kl": 0.1142578125, "learning_rate": 3.975903614457831e-07, "loss": 0.0046, "reward": 2.3579249382019043, "reward_std": 0.26106585562229156, "rewards/accuracy_reward": 1.3657374382019043, "rewards/format_reward": 0.9921875, "step": 300 }, { "completion_length": 81.78125, "epoch": 3.6265060240963853, "grad_norm": 4.192951592702023, "kl": 0.090087890625, "learning_rate": 3.9558232931726903e-07, "loss": 0.0036, "reward": 2.320730686187744, "reward_std": 0.17225497588515282, "rewards/accuracy_reward": 1.3207308053970337, "rewards/format_reward": 1.0, "step": 301 }, { "completion_length": 81.78125, "epoch": 3.63855421686747, "grad_norm": 3.914334064533718, "kl": 0.082763671875, "learning_rate": 3.93574297188755e-07, "loss": 0.0033, "reward": 2.2756303548812866, "reward_std": 0.21440081298351288, "rewards/accuracy_reward": 1.2834429144859314, "rewards/format_reward": 0.9921875, "step": 302 }, { "completion_length": 83.984375, "epoch": 3.6506024096385543, "grad_norm": 2.9158995310046705, "kl": 0.09326171875, "learning_rate": 3.9156626506024094e-07, "loss": 0.0037, "reward": 2.340207576751709, "reward_std": 0.22486132383346558, "rewards/accuracy_reward": 1.3402075171470642, "rewards/format_reward": 1.0, "step": 303 }, { "completion_length": 73.0078125, "epoch": 3.662650602409639, "grad_norm": 3.64523826351094, "kl": 0.130615234375, "learning_rate": 3.895582329317269e-07, "loss": 0.0052, "reward": 2.306045651435852, "reward_std": 0.21042678505182266, "rewards/accuracy_reward": 1.313858151435852, "rewards/format_reward": 0.9921875, "step": 304 }, { "completion_length": 77.140625, "epoch": 3.674698795180723, "grad_norm": 4.763683185347457, "kl": 0.09619140625, "learning_rate": 3.8755020080321285e-07, "loss": 0.0038, "reward": 2.292635202407837, "reward_std": 0.24200939387083054, "rewards/accuracy_reward": 1.308260202407837, "rewards/format_reward": 0.984375, "step": 305 }, { "completion_length": 80.6875, "epoch": 3.6867469879518073, "grad_norm": 15.378313149094321, "kl": 0.130126953125, "learning_rate": 3.8554216867469877e-07, "loss": 0.0052, "reward": 2.2641184329986572, "reward_std": 0.20184506475925446, "rewards/accuracy_reward": 1.2719308137893677, "rewards/format_reward": 0.9921875, "step": 306 }, { "completion_length": 72.4453125, "epoch": 3.6987951807228914, "grad_norm": 6.1838290298686225, "kl": 0.114501953125, "learning_rate": 3.835341365461847e-07, "loss": 0.0046, "reward": 2.4186692237854004, "reward_std": 0.20656991004943848, "rewards/accuracy_reward": 1.4264817833900452, "rewards/format_reward": 0.9921875, "step": 307 }, { "completion_length": 73.71875, "epoch": 3.710843373493976, "grad_norm": 3.6680281562358794, "kl": 0.092041015625, "learning_rate": 3.815261044176707e-07, "loss": 0.0037, "reward": 2.3598402738571167, "reward_std": 0.1814076155424118, "rewards/accuracy_reward": 1.3598402738571167, "rewards/format_reward": 1.0, "step": 308 }, { "completion_length": 75.5625, "epoch": 3.7228915662650603, "grad_norm": 4.1513164017455635, "kl": 0.11962890625, "learning_rate": 3.795180722891566e-07, "loss": 0.0048, "reward": 2.2364041805267334, "reward_std": 0.20799466967582703, "rewards/accuracy_reward": 1.236404299736023, "rewards/format_reward": 1.0, "step": 309 }, { "completion_length": 76.2109375, "epoch": 3.734939759036145, "grad_norm": 4.53835509987933, "kl": 0.088623046875, "learning_rate": 3.7751004016064253e-07, "loss": 0.0036, "reward": 2.3527251482009888, "reward_std": 0.17692391574382782, "rewards/accuracy_reward": 1.3527252078056335, "rewards/format_reward": 1.0, "step": 310 }, { "completion_length": 80.4375, "epoch": 3.746987951807229, "grad_norm": 3.703393707261026, "kl": 0.1103515625, "learning_rate": 3.755020080321285e-07, "loss": 0.0044, "reward": 2.298377275466919, "reward_std": 0.21109677106142044, "rewards/accuracy_reward": 1.2983773350715637, "rewards/format_reward": 1.0, "step": 311 }, { "completion_length": 77.8125, "epoch": 3.7590361445783134, "grad_norm": 3.914375784414754, "kl": 0.138916015625, "learning_rate": 3.7349397590361444e-07, "loss": 0.0056, "reward": 2.1520947217941284, "reward_std": 0.19967754930257797, "rewards/accuracy_reward": 1.1520947813987732, "rewards/format_reward": 1.0, "step": 312 }, { "completion_length": 79.2578125, "epoch": 3.7710843373493974, "grad_norm": 5.606330092523797, "kl": 0.091064453125, "learning_rate": 3.714859437751004e-07, "loss": 0.0036, "reward": 2.3204472064971924, "reward_std": 0.1748044565320015, "rewards/accuracy_reward": 1.3204472661018372, "rewards/format_reward": 1.0, "step": 313 }, { "completion_length": 74.84375, "epoch": 3.783132530120482, "grad_norm": 3.2348525038063736, "kl": 0.08447265625, "learning_rate": 3.694779116465863e-07, "loss": 0.0034, "reward": 2.496751070022583, "reward_std": 0.2072158306837082, "rewards/accuracy_reward": 1.496751070022583, "rewards/format_reward": 1.0, "step": 314 }, { "completion_length": 74.296875, "epoch": 3.7951807228915664, "grad_norm": 3.7371491385040483, "kl": 0.0771484375, "learning_rate": 3.674698795180723e-07, "loss": 0.0031, "reward": 2.395453691482544, "reward_std": 0.16877512633800507, "rewards/accuracy_reward": 1.3954537510871887, "rewards/format_reward": 1.0, "step": 315 }, { "completion_length": 72.8671875, "epoch": 3.807228915662651, "grad_norm": 5.799331345023467, "kl": 0.09619140625, "learning_rate": 3.654618473895582e-07, "loss": 0.0039, "reward": 2.307594895362854, "reward_std": 0.1985296756029129, "rewards/accuracy_reward": 1.307594895362854, "rewards/format_reward": 1.0, "step": 316 }, { "completion_length": 72.84375, "epoch": 3.819277108433735, "grad_norm": 5.215215330938529, "kl": 0.11083984375, "learning_rate": 3.634538152610442e-07, "loss": 0.0044, "reward": 2.2713290452957153, "reward_std": 0.15980049967765808, "rewards/accuracy_reward": 1.2791414856910706, "rewards/format_reward": 0.9921875, "step": 317 }, { "completion_length": 66.28125, "epoch": 3.8313253012048194, "grad_norm": 9.42828281313003, "kl": 0.106201171875, "learning_rate": 3.614457831325301e-07, "loss": 0.0042, "reward": 2.441011667251587, "reward_std": 0.21370699256658554, "rewards/accuracy_reward": 1.4566364884376526, "rewards/format_reward": 0.984375, "step": 318 }, { "completion_length": 74.3359375, "epoch": 3.8433734939759034, "grad_norm": 3.380164477319568, "kl": 0.094970703125, "learning_rate": 3.5943775100401604e-07, "loss": 0.0038, "reward": 2.5070927143096924, "reward_std": 0.16660126298666, "rewards/accuracy_reward": 1.5149051547050476, "rewards/format_reward": 0.9921875, "step": 319 }, { "completion_length": 71.3046875, "epoch": 3.855421686746988, "grad_norm": 4.006205885169367, "kl": 0.128662109375, "learning_rate": 3.57429718875502e-07, "loss": 0.0051, "reward": 2.3042829036712646, "reward_std": 0.2031613141298294, "rewards/accuracy_reward": 1.3042829036712646, "rewards/format_reward": 1.0, "step": 320 }, { "completion_length": 73.9609375, "epoch": 3.8674698795180724, "grad_norm": 5.771036516275782, "kl": 0.093017578125, "learning_rate": 3.554216867469879e-07, "loss": 0.0037, "reward": 2.422416090965271, "reward_std": 0.19139418005943298, "rewards/accuracy_reward": 1.4302285313606262, "rewards/format_reward": 0.9921875, "step": 321 }, { "completion_length": 71.734375, "epoch": 3.8795180722891565, "grad_norm": 5.860041479699707, "kl": 0.110595703125, "learning_rate": 3.5341365461847387e-07, "loss": 0.0044, "reward": 2.100473999977112, "reward_std": 0.21565508097410202, "rewards/accuracy_reward": 1.1004739999771118, "rewards/format_reward": 1.0, "step": 322 }, { "completion_length": 69.046875, "epoch": 3.891566265060241, "grad_norm": 4.962719097630754, "kl": 0.1396484375, "learning_rate": 3.514056224899598e-07, "loss": 0.0056, "reward": 2.337049961090088, "reward_std": 0.201468363404274, "rewards/accuracy_reward": 1.337049961090088, "rewards/format_reward": 1.0, "step": 323 }, { "completion_length": 70.0234375, "epoch": 3.9036144578313254, "grad_norm": 3.786778485554144, "kl": 0.1064453125, "learning_rate": 3.493975903614458e-07, "loss": 0.0043, "reward": 2.282514452934265, "reward_std": 0.2470734864473343, "rewards/accuracy_reward": 1.2903268933296204, "rewards/format_reward": 0.9921875, "step": 324 }, { "completion_length": 66.5546875, "epoch": 3.9156626506024095, "grad_norm": 5.681847770854111, "kl": 0.14599609375, "learning_rate": 3.473895582329317e-07, "loss": 0.0059, "reward": 2.2830464839935303, "reward_std": 0.16951018571853638, "rewards/accuracy_reward": 1.2830466032028198, "rewards/format_reward": 1.0, "step": 325 }, { "completion_length": 69.9765625, "epoch": 3.927710843373494, "grad_norm": 3.545177223680582, "kl": 0.1123046875, "learning_rate": 3.453815261044177e-07, "loss": 0.0045, "reward": 2.3249276876449585, "reward_std": 0.23469389975070953, "rewards/accuracy_reward": 1.3249276876449585, "rewards/format_reward": 1.0, "step": 326 }, { "completion_length": 67.2109375, "epoch": 3.9397590361445785, "grad_norm": 4.464381426334607, "kl": 0.111328125, "learning_rate": 3.433734939759036e-07, "loss": 0.0045, "reward": 2.313346743583679, "reward_std": 0.24960950016975403, "rewards/accuracy_reward": 1.321159303188324, "rewards/format_reward": 0.9921875, "step": 327 }, { "completion_length": 69.5390625, "epoch": 3.9518072289156625, "grad_norm": 5.503294892764904, "kl": 0.13818359375, "learning_rate": 3.413654618473896e-07, "loss": 0.0055, "reward": 2.250451922416687, "reward_std": 0.19627484679222107, "rewards/accuracy_reward": 1.2582644820213318, "rewards/format_reward": 0.9921875, "step": 328 }, { "completion_length": 72.875, "epoch": 3.963855421686747, "grad_norm": 3.94333602961405, "kl": 0.126953125, "learning_rate": 3.3935742971887547e-07, "loss": 0.0051, "reward": 2.4282917976379395, "reward_std": 0.23817364871501923, "rewards/accuracy_reward": 1.4361043572425842, "rewards/format_reward": 0.9921875, "step": 329 }, { "completion_length": 68.078125, "epoch": 3.9759036144578315, "grad_norm": 4.246221946155538, "kl": 0.10302734375, "learning_rate": 3.373493975903614e-07, "loss": 0.0041, "reward": 2.3756778240203857, "reward_std": 0.23032685369253159, "rewards/accuracy_reward": 1.3756778836250305, "rewards/format_reward": 1.0, "step": 330 }, { "completion_length": 63.171875, "epoch": 3.9879518072289155, "grad_norm": 4.823180720092978, "kl": 0.14111328125, "learning_rate": 3.353413654618474e-07, "loss": 0.0057, "reward": 2.2716495990753174, "reward_std": 0.25546562671661377, "rewards/accuracy_reward": 1.2794621586799622, "rewards/format_reward": 0.9921875, "step": 331 }, { "completion_length": 79.75000381469727, "epoch": 4.0, "grad_norm": 3.966089593429622, "kl": 0.10986328125, "learning_rate": 3.333333333333333e-07, "loss": 0.0047, "reward": 1.9844202995300293, "reward_std": 0.41577973030507565, "rewards/accuracy_reward": 0.9844204187393188, "rewards/format_reward": 1.0, "step": 332 }, { "completion_length": 67.8984375, "epoch": 4.0120481927710845, "grad_norm": 3.4890518846644203, "kl": 0.112548828125, "learning_rate": 3.313253012048193e-07, "loss": 0.0045, "reward": 2.273194432258606, "reward_std": 0.1845482587814331, "rewards/accuracy_reward": 1.2810069918632507, "rewards/format_reward": 0.9921875, "step": 333 }, { "completion_length": 70.1328125, "epoch": 4.024096385542169, "grad_norm": 3.1401475074211698, "kl": 0.106201171875, "learning_rate": 3.293172690763052e-07, "loss": 0.0042, "reward": 2.348654627799988, "reward_std": 0.20452319085597992, "rewards/accuracy_reward": 1.3564670085906982, "rewards/format_reward": 0.9921875, "step": 334 }, { "completion_length": 67.4296875, "epoch": 4.036144578313253, "grad_norm": 4.049959483426693, "kl": 0.107177734375, "learning_rate": 3.273092369477912e-07, "loss": 0.0043, "reward": 2.270454525947571, "reward_std": 0.21142029762268066, "rewards/accuracy_reward": 1.2704546451568604, "rewards/format_reward": 1.0, "step": 335 }, { "completion_length": 71.1484375, "epoch": 4.048192771084337, "grad_norm": 3.9561612834766273, "kl": 0.097412109375, "learning_rate": 3.2530120481927706e-07, "loss": 0.0039, "reward": 2.1833893060684204, "reward_std": 0.1801520176231861, "rewards/accuracy_reward": 1.1912018656730652, "rewards/format_reward": 0.9921875, "step": 336 }, { "completion_length": 69.59375, "epoch": 4.0602409638554215, "grad_norm": 3.977655100011985, "kl": 0.1474609375, "learning_rate": 3.2329317269076304e-07, "loss": 0.0059, "reward": 2.2047336101531982, "reward_std": 0.1999206244945526, "rewards/accuracy_reward": 1.204733669757843, "rewards/format_reward": 1.0, "step": 337 }, { "completion_length": 61.4765625, "epoch": 4.072289156626506, "grad_norm": 4.191698428231115, "kl": 0.12939453125, "learning_rate": 3.2128514056224897e-07, "loss": 0.0052, "reward": 2.3498200178146362, "reward_std": 0.2275300845503807, "rewards/accuracy_reward": 1.3498198986053467, "rewards/format_reward": 1.0, "step": 338 }, { "completion_length": 64.4140625, "epoch": 4.0843373493975905, "grad_norm": 3.9067810348739114, "kl": 0.116943359375, "learning_rate": 3.192771084337349e-07, "loss": 0.0047, "reward": 2.352308511734009, "reward_std": 0.22002745419740677, "rewards/accuracy_reward": 1.3523083925247192, "rewards/format_reward": 1.0, "step": 339 }, { "completion_length": 73.2890625, "epoch": 4.096385542168675, "grad_norm": 4.489032904646898, "kl": 0.104736328125, "learning_rate": 3.172690763052209e-07, "loss": 0.0042, "reward": 2.1710336208343506, "reward_std": 0.17718148604035378, "rewards/accuracy_reward": 1.1710334420204163, "rewards/format_reward": 1.0, "step": 340 }, { "completion_length": 74.3671875, "epoch": 4.108433734939759, "grad_norm": 4.230949730619595, "kl": 0.139892578125, "learning_rate": 3.152610441767068e-07, "loss": 0.0056, "reward": 2.084486246109009, "reward_std": 0.2170683741569519, "rewards/accuracy_reward": 1.0922988057136536, "rewards/format_reward": 0.9921875, "step": 341 }, { "completion_length": 65.5625, "epoch": 4.120481927710843, "grad_norm": 5.461293103432774, "kl": 0.1044921875, "learning_rate": 3.132530120481928e-07, "loss": 0.0042, "reward": 2.381394147872925, "reward_std": 0.193039670586586, "rewards/accuracy_reward": 1.38139408826828, "rewards/format_reward": 1.0, "step": 342 }, { "completion_length": 66.15625, "epoch": 4.132530120481928, "grad_norm": 4.070866693962467, "kl": 0.111572265625, "learning_rate": 3.112449799196787e-07, "loss": 0.0045, "reward": 2.357278347015381, "reward_std": 0.15215902030467987, "rewards/accuracy_reward": 1.3729035258293152, "rewards/format_reward": 0.984375, "step": 343 }, { "completion_length": 69.1328125, "epoch": 4.144578313253012, "grad_norm": 4.335873726549927, "kl": 0.123046875, "learning_rate": 3.0923694779116464e-07, "loss": 0.0049, "reward": 2.282222032546997, "reward_std": 0.25280918926000595, "rewards/accuracy_reward": 1.2978470921516418, "rewards/format_reward": 0.984375, "step": 344 }, { "completion_length": 73.6015625, "epoch": 4.156626506024097, "grad_norm": 4.412489990442917, "kl": 0.09765625, "learning_rate": 3.0722891566265056e-07, "loss": 0.0039, "reward": 2.421238660812378, "reward_std": 0.21779820322990417, "rewards/accuracy_reward": 1.4290512800216675, "rewards/format_reward": 0.9921875, "step": 345 }, { "completion_length": 67.3984375, "epoch": 4.168674698795181, "grad_norm": 3.7050619604015775, "kl": 0.111083984375, "learning_rate": 3.0522088353413654e-07, "loss": 0.0044, "reward": 2.4159966707229614, "reward_std": 0.17116259038448334, "rewards/accuracy_reward": 1.4159966707229614, "rewards/format_reward": 1.0, "step": 346 }, { "completion_length": 68.7109375, "epoch": 4.180722891566265, "grad_norm": 4.638840034522594, "kl": 0.119873046875, "learning_rate": 3.0321285140562247e-07, "loss": 0.0048, "reward": 2.430918335914612, "reward_std": 0.23829656839370728, "rewards/accuracy_reward": 1.4309183359146118, "rewards/format_reward": 1.0, "step": 347 }, { "completion_length": 68.203125, "epoch": 4.192771084337349, "grad_norm": 7.531973472034052, "kl": 0.124267578125, "learning_rate": 3.0120481927710845e-07, "loss": 0.005, "reward": 2.2654261589050293, "reward_std": 0.214869923889637, "rewards/accuracy_reward": 1.2966760993003845, "rewards/format_reward": 0.96875, "step": 348 }, { "completion_length": 66.3046875, "epoch": 4.204819277108434, "grad_norm": 6.290139006407989, "kl": 0.15673828125, "learning_rate": 2.991967871485944e-07, "loss": 0.0063, "reward": 2.440833330154419, "reward_std": 0.20570393651723862, "rewards/accuracy_reward": 1.4642709493637085, "rewards/format_reward": 0.9765625, "step": 349 }, { "completion_length": 68.5078125, "epoch": 4.216867469879518, "grad_norm": 3.870085506410607, "kl": 0.11376953125, "learning_rate": 2.971887550200803e-07, "loss": 0.0046, "reward": 2.4419082403182983, "reward_std": 0.1332126259803772, "rewards/accuracy_reward": 1.441908359527588, "rewards/format_reward": 1.0, "step": 350 }, { "completion_length": 67.7109375, "epoch": 4.228915662650603, "grad_norm": 5.222390077968289, "kl": 0.12548828125, "learning_rate": 2.9518072289156623e-07, "loss": 0.005, "reward": 2.354392647743225, "reward_std": 0.250136561691761, "rewards/accuracy_reward": 1.3700175285339355, "rewards/format_reward": 0.984375, "step": 351 }, { "completion_length": 63.75, "epoch": 4.240963855421687, "grad_norm": 5.7394258697520835, "kl": 0.13671875, "learning_rate": 2.9317269076305216e-07, "loss": 0.0055, "reward": 2.1846532821655273, "reward_std": 0.27685467153787613, "rewards/accuracy_reward": 1.2080907225608826, "rewards/format_reward": 0.9765625, "step": 352 }, { "completion_length": 68.734375, "epoch": 4.253012048192771, "grad_norm": 3.522967170920438, "kl": 0.10400390625, "learning_rate": 2.9116465863453814e-07, "loss": 0.0041, "reward": 2.315014600753784, "reward_std": 0.13816260546445847, "rewards/accuracy_reward": 1.3150146007537842, "rewards/format_reward": 1.0, "step": 353 }, { "completion_length": 72.8125, "epoch": 4.265060240963855, "grad_norm": 3.727859373676823, "kl": 0.12939453125, "learning_rate": 2.8915662650602407e-07, "loss": 0.0052, "reward": 2.206972360610962, "reward_std": 0.23467965424060822, "rewards/accuracy_reward": 1.2069722414016724, "rewards/format_reward": 1.0, "step": 354 }, { "completion_length": 70.3359375, "epoch": 4.27710843373494, "grad_norm": 3.380662774166939, "kl": 0.09716796875, "learning_rate": 2.8714859437751005e-07, "loss": 0.0039, "reward": 2.1916306018829346, "reward_std": 0.23339906334877014, "rewards/accuracy_reward": 1.2072556018829346, "rewards/format_reward": 0.984375, "step": 355 }, { "completion_length": 72.4375, "epoch": 4.289156626506024, "grad_norm": 3.5703829288777764, "kl": 0.11376953125, "learning_rate": 2.85140562248996e-07, "loss": 0.0046, "reward": 2.142443895339966, "reward_std": 0.2050827294588089, "rewards/accuracy_reward": 1.1580689549446106, "rewards/format_reward": 0.984375, "step": 356 }, { "completion_length": 66.9921875, "epoch": 4.301204819277109, "grad_norm": 3.6787951883313275, "kl": 0.119873046875, "learning_rate": 2.8313253012048195e-07, "loss": 0.0048, "reward": 2.6013587713241577, "reward_std": 0.17792491614818573, "rewards/accuracy_reward": 1.6013588309288025, "rewards/format_reward": 1.0, "step": 357 }, { "completion_length": 67.1875, "epoch": 4.313253012048193, "grad_norm": 7.9299540096420476, "kl": 0.111328125, "learning_rate": 2.811244979919679e-07, "loss": 0.0044, "reward": 2.2114800214767456, "reward_std": 0.2541910707950592, "rewards/accuracy_reward": 1.2271050810813904, "rewards/format_reward": 0.984375, "step": 358 }, { "completion_length": 69.1953125, "epoch": 4.325301204819277, "grad_norm": 3.7315177619787687, "kl": 0.10400390625, "learning_rate": 2.7911646586345376e-07, "loss": 0.0042, "reward": 2.2850147485733032, "reward_std": 0.24116653203964233, "rewards/accuracy_reward": 1.3084524869918823, "rewards/format_reward": 0.9765625, "step": 359 }, { "completion_length": 76.6640625, "epoch": 4.337349397590361, "grad_norm": 3.8031600707561886, "kl": 0.08984375, "learning_rate": 2.7710843373493974e-07, "loss": 0.0036, "reward": 2.372725009918213, "reward_std": 0.23598377406597137, "rewards/accuracy_reward": 1.380537509918213, "rewards/format_reward": 0.9921875, "step": 360 }, { "completion_length": 72.6015625, "epoch": 4.349397590361446, "grad_norm": 6.29903230301134, "kl": 0.10205078125, "learning_rate": 2.7510040160642566e-07, "loss": 0.0041, "reward": 2.3671088218688965, "reward_std": 0.21375955641269684, "rewards/accuracy_reward": 1.3749213814735413, "rewards/format_reward": 0.9921875, "step": 361 }, { "completion_length": 74.546875, "epoch": 4.36144578313253, "grad_norm": 4.5097271327174555, "kl": 0.100341796875, "learning_rate": 2.7309236947791164e-07, "loss": 0.004, "reward": 2.338581085205078, "reward_std": 0.21793486177921295, "rewards/accuracy_reward": 1.3463934063911438, "rewards/format_reward": 0.9921875, "step": 362 }, { "completion_length": 73.203125, "epoch": 4.373493975903615, "grad_norm": 7.563928087147195, "kl": 0.093505859375, "learning_rate": 2.7108433734939757e-07, "loss": 0.0037, "reward": 2.4811813831329346, "reward_std": 0.1661686971783638, "rewards/accuracy_reward": 1.4811814427375793, "rewards/format_reward": 1.0, "step": 363 }, { "completion_length": 72.2109375, "epoch": 4.385542168674699, "grad_norm": 4.157739455544304, "kl": 0.11767578125, "learning_rate": 2.6907630522088355e-07, "loss": 0.0047, "reward": 2.227518320083618, "reward_std": 0.2459297701716423, "rewards/accuracy_reward": 1.235330879688263, "rewards/format_reward": 0.9921875, "step": 364 }, { "completion_length": 73.125, "epoch": 4.397590361445783, "grad_norm": 3.957643739786318, "kl": 0.130126953125, "learning_rate": 2.670682730923695e-07, "loss": 0.0052, "reward": 2.398737668991089, "reward_std": 0.2508920058608055, "rewards/accuracy_reward": 1.406550109386444, "rewards/format_reward": 0.9921875, "step": 365 }, { "completion_length": 80.6484375, "epoch": 4.409638554216867, "grad_norm": 8.267939908268028, "kl": 0.126220703125, "learning_rate": 2.6506024096385546e-07, "loss": 0.005, "reward": 2.1884970664978027, "reward_std": 0.32723745703697205, "rewards/accuracy_reward": 1.2119346857070923, "rewards/format_reward": 0.9765625, "step": 366 }, { "completion_length": 80.09375, "epoch": 4.421686746987952, "grad_norm": 3.0023836541953988, "kl": 0.089111328125, "learning_rate": 2.6305220883534133e-07, "loss": 0.0036, "reward": 2.4019484519958496, "reward_std": 0.20879995077848434, "rewards/accuracy_reward": 1.4019483923912048, "rewards/format_reward": 1.0, "step": 367 }, { "completion_length": 76.890625, "epoch": 4.433734939759036, "grad_norm": 3.8760535577901916, "kl": 0.110107421875, "learning_rate": 2.610441767068273e-07, "loss": 0.0044, "reward": 2.217389702796936, "reward_std": 0.20581622421741486, "rewards/accuracy_reward": 1.225202202796936, "rewards/format_reward": 0.9921875, "step": 368 }, { "completion_length": 70.046875, "epoch": 4.445783132530121, "grad_norm": 4.189426211226252, "kl": 0.09912109375, "learning_rate": 2.5903614457831324e-07, "loss": 0.004, "reward": 2.3884357213974, "reward_std": 0.23216703534126282, "rewards/accuracy_reward": 1.4118732213974, "rewards/format_reward": 0.9765625, "step": 369 }, { "completion_length": 75.3125, "epoch": 4.457831325301205, "grad_norm": 3.5709834038432886, "kl": 0.112060546875, "learning_rate": 2.5702811244979916e-07, "loss": 0.0045, "reward": 2.4395360946655273, "reward_std": 0.25345855951309204, "rewards/accuracy_reward": 1.4551611542701721, "rewards/format_reward": 0.984375, "step": 370 }, { "completion_length": 76.03125, "epoch": 4.469879518072289, "grad_norm": 3.8012985013892897, "kl": 0.11962890625, "learning_rate": 2.5502008032128514e-07, "loss": 0.0048, "reward": 2.2614444494247437, "reward_std": 0.25984859466552734, "rewards/accuracy_reward": 1.2692569494247437, "rewards/format_reward": 0.9921875, "step": 371 }, { "completion_length": 72.34375, "epoch": 4.481927710843373, "grad_norm": 3.81905493683615, "kl": 0.118408203125, "learning_rate": 2.5301204819277107e-07, "loss": 0.0047, "reward": 2.24534273147583, "reward_std": 0.2783522978425026, "rewards/accuracy_reward": 1.25315523147583, "rewards/format_reward": 0.9921875, "step": 372 }, { "completion_length": 73.625, "epoch": 4.493975903614458, "grad_norm": 5.859434170398068, "kl": 0.129638671875, "learning_rate": 2.5100401606425705e-07, "loss": 0.0052, "reward": 2.242166519165039, "reward_std": 0.19818732887506485, "rewards/accuracy_reward": 1.2421664595603943, "rewards/format_reward": 1.0, "step": 373 }, { "completion_length": 70.7734375, "epoch": 4.506024096385542, "grad_norm": 4.577359942879205, "kl": 0.113037109375, "learning_rate": 2.489959839357429e-07, "loss": 0.0045, "reward": 2.40807843208313, "reward_std": 0.16506175324320793, "rewards/accuracy_reward": 1.408078372478485, "rewards/format_reward": 1.0, "step": 374 }, { "completion_length": 71.6484375, "epoch": 4.518072289156627, "grad_norm": 3.6969886550918627, "kl": 0.0947265625, "learning_rate": 2.469879518072289e-07, "loss": 0.0038, "reward": 2.4090828895568848, "reward_std": 0.17872843891382217, "rewards/accuracy_reward": 1.4090829491615295, "rewards/format_reward": 1.0, "step": 375 }, { "completion_length": 75.640625, "epoch": 4.530120481927711, "grad_norm": 3.182069910394249, "kl": 0.112548828125, "learning_rate": 2.4497991967871483e-07, "loss": 0.0045, "reward": 2.429325222969055, "reward_std": 0.18355486541986465, "rewards/accuracy_reward": 1.4371376037597656, "rewards/format_reward": 0.9921875, "step": 376 }, { "completion_length": 76.8515625, "epoch": 4.542168674698795, "grad_norm": 4.3761923522139625, "kl": 0.103515625, "learning_rate": 2.429718875502008e-07, "loss": 0.0041, "reward": 2.215627670288086, "reward_std": 0.29024538397789, "rewards/accuracy_reward": 1.2234401106834412, "rewards/format_reward": 0.9921875, "step": 377 }, { "completion_length": 72.640625, "epoch": 4.554216867469879, "grad_norm": 5.739152465768093, "kl": 0.096923828125, "learning_rate": 2.4096385542168674e-07, "loss": 0.0039, "reward": 2.3864386081695557, "reward_std": 0.14991050213575363, "rewards/accuracy_reward": 1.3864384889602661, "rewards/format_reward": 1.0, "step": 378 }, { "completion_length": 73.7890625, "epoch": 4.566265060240964, "grad_norm": 4.330609617515541, "kl": 0.105712890625, "learning_rate": 2.3895582329317267e-07, "loss": 0.0042, "reward": 2.2676793336868286, "reward_std": 0.1841476932168007, "rewards/accuracy_reward": 1.2754917740821838, "rewards/format_reward": 0.9921875, "step": 379 }, { "completion_length": 69.5859375, "epoch": 4.578313253012048, "grad_norm": 16.70825245009543, "kl": 0.103515625, "learning_rate": 2.3694779116465862e-07, "loss": 0.0041, "reward": 2.3687047958374023, "reward_std": 0.23368250578641891, "rewards/accuracy_reward": 1.3765172958374023, "rewards/format_reward": 0.9921875, "step": 380 }, { "completion_length": 68.5703125, "epoch": 4.590361445783133, "grad_norm": 4.946973705468274, "kl": 0.11865234375, "learning_rate": 2.3493975903614457e-07, "loss": 0.0047, "reward": 2.409714102745056, "reward_std": 0.17494437843561172, "rewards/accuracy_reward": 1.4175265431404114, "rewards/format_reward": 0.9921875, "step": 381 }, { "completion_length": 69.09375, "epoch": 4.602409638554217, "grad_norm": 3.4407209788639155, "kl": 0.108154296875, "learning_rate": 2.3293172690763053e-07, "loss": 0.0043, "reward": 2.3722596168518066, "reward_std": 0.2456066906452179, "rewards/accuracy_reward": 1.3722596764564514, "rewards/format_reward": 1.0, "step": 382 }, { "completion_length": 73.40625, "epoch": 4.614457831325301, "grad_norm": 6.785057754949663, "kl": 0.093017578125, "learning_rate": 2.3092369477911648e-07, "loss": 0.0037, "reward": 2.390730619430542, "reward_std": 0.13034258037805557, "rewards/accuracy_reward": 1.390730619430542, "rewards/format_reward": 1.0, "step": 383 }, { "completion_length": 69.578125, "epoch": 4.626506024096385, "grad_norm": 4.146766679362004, "kl": 0.110107421875, "learning_rate": 2.2891566265060238e-07, "loss": 0.0044, "reward": 2.457837224006653, "reward_std": 0.19646844267845154, "rewards/accuracy_reward": 1.465649664402008, "rewards/format_reward": 0.9921875, "step": 384 }, { "completion_length": 71.4765625, "epoch": 4.63855421686747, "grad_norm": 3.5134218173180884, "kl": 0.10791015625, "learning_rate": 2.2690763052208834e-07, "loss": 0.0043, "reward": 2.2395870685577393, "reward_std": 0.23986083269119263, "rewards/accuracy_reward": 1.2630245089530945, "rewards/format_reward": 0.9765625, "step": 385 }, { "completion_length": 67.8984375, "epoch": 4.650602409638554, "grad_norm": 3.5532098801033323, "kl": 0.112060546875, "learning_rate": 2.248995983935743e-07, "loss": 0.0045, "reward": 2.155800759792328, "reward_std": 0.26599714159965515, "rewards/accuracy_reward": 1.1714258790016174, "rewards/format_reward": 0.984375, "step": 386 }, { "completion_length": 67.921875, "epoch": 4.662650602409639, "grad_norm": 3.977191337497143, "kl": 0.12353515625, "learning_rate": 2.2289156626506022e-07, "loss": 0.0049, "reward": 2.1573885679244995, "reward_std": 0.19674725830554962, "rewards/accuracy_reward": 1.165201187133789, "rewards/format_reward": 0.9921875, "step": 387 }, { "completion_length": 73.3671875, "epoch": 4.674698795180722, "grad_norm": 3.4384187805900894, "kl": 0.1005859375, "learning_rate": 2.2088353413654617e-07, "loss": 0.004, "reward": 2.238619089126587, "reward_std": 0.1663391888141632, "rewards/accuracy_reward": 1.2386190295219421, "rewards/format_reward": 1.0, "step": 388 }, { "completion_length": 71.3515625, "epoch": 4.686746987951807, "grad_norm": 3.6715987846617737, "kl": 0.1103515625, "learning_rate": 2.1887550200803212e-07, "loss": 0.0044, "reward": 2.2813053131103516, "reward_std": 0.20307840406894684, "rewards/accuracy_reward": 1.2891177535057068, "rewards/format_reward": 0.9921875, "step": 389 }, { "completion_length": 67.8671875, "epoch": 4.698795180722891, "grad_norm": 4.1990886176906566, "kl": 0.1181640625, "learning_rate": 2.1686746987951808e-07, "loss": 0.0047, "reward": 2.3316123485565186, "reward_std": 0.18899912387132645, "rewards/accuracy_reward": 1.339424967765808, "rewards/format_reward": 0.9921875, "step": 390 }, { "completion_length": 73.5390625, "epoch": 4.710843373493976, "grad_norm": 4.5848307121684035, "kl": 0.11767578125, "learning_rate": 2.14859437751004e-07, "loss": 0.0047, "reward": 2.3556346893310547, "reward_std": 0.17518161982297897, "rewards/accuracy_reward": 1.3634473085403442, "rewards/format_reward": 0.9921875, "step": 391 }, { "completion_length": 73.3828125, "epoch": 4.72289156626506, "grad_norm": 4.308895887462787, "kl": 0.09716796875, "learning_rate": 2.1285140562248996e-07, "loss": 0.0039, "reward": 2.3230199813842773, "reward_std": 0.2215501293540001, "rewards/accuracy_reward": 1.3230200409889221, "rewards/format_reward": 1.0, "step": 392 }, { "completion_length": 71.625, "epoch": 4.734939759036145, "grad_norm": 3.8869195849917335, "kl": 0.117919921875, "learning_rate": 2.108433734939759e-07, "loss": 0.0047, "reward": 2.311624765396118, "reward_std": 0.233637273311615, "rewards/accuracy_reward": 1.3116250038146973, "rewards/format_reward": 1.0, "step": 393 }, { "completion_length": 67.828125, "epoch": 4.746987951807229, "grad_norm": 4.950759054297939, "kl": 0.10888671875, "learning_rate": 2.0883534136546184e-07, "loss": 0.0044, "reward": 2.379747152328491, "reward_std": 0.19298578798770905, "rewards/accuracy_reward": 1.3797469735145569, "rewards/format_reward": 1.0, "step": 394 }, { "completion_length": 72.2578125, "epoch": 4.759036144578313, "grad_norm": 45.47765651174386, "kl": 0.126708984375, "learning_rate": 2.0682730923694776e-07, "loss": 0.0051, "reward": 2.078563928604126, "reward_std": 0.253988578915596, "rewards/accuracy_reward": 1.0941888689994812, "rewards/format_reward": 0.984375, "step": 395 }, { "completion_length": 71.6484375, "epoch": 4.771084337349397, "grad_norm": 6.044646695827286, "kl": 0.13916015625, "learning_rate": 2.0481927710843372e-07, "loss": 0.0056, "reward": 2.485829472541809, "reward_std": 0.180104598402977, "rewards/accuracy_reward": 1.4858292937278748, "rewards/format_reward": 1.0, "step": 396 }, { "completion_length": 65.09375, "epoch": 4.783132530120482, "grad_norm": 4.360820446081869, "kl": 0.1416015625, "learning_rate": 2.0281124497991967e-07, "loss": 0.0057, "reward": 2.1638635396957397, "reward_std": 0.31551285088062286, "rewards/accuracy_reward": 1.1873010993003845, "rewards/format_reward": 0.9765625, "step": 397 }, { "completion_length": 70.6328125, "epoch": 4.795180722891566, "grad_norm": 5.234619949658262, "kl": 0.115966796875, "learning_rate": 2.0080321285140563e-07, "loss": 0.0046, "reward": 2.424190402030945, "reward_std": 0.23157334327697754, "rewards/accuracy_reward": 1.4241904616355896, "rewards/format_reward": 1.0, "step": 398 }, { "completion_length": 70.4375, "epoch": 4.807228915662651, "grad_norm": 5.2543384630783265, "kl": 0.12060546875, "learning_rate": 1.9879518072289155e-07, "loss": 0.0048, "reward": 2.3333520889282227, "reward_std": 0.2145429253578186, "rewards/accuracy_reward": 1.3411647081375122, "rewards/format_reward": 0.9921875, "step": 399 }, { "completion_length": 65.421875, "epoch": 4.8192771084337345, "grad_norm": 6.050688926597152, "kl": 0.125732421875, "learning_rate": 1.967871485943775e-07, "loss": 0.005, "reward": 2.412783145904541, "reward_std": 0.2059781178832054, "rewards/accuracy_reward": 1.420595645904541, "rewards/format_reward": 0.9921875, "step": 400 }, { "completion_length": 63.5546875, "epoch": 4.831325301204819, "grad_norm": 4.14350718873446, "kl": 0.143798828125, "learning_rate": 1.9477911646586346e-07, "loss": 0.0057, "reward": 2.3667309284210205, "reward_std": 0.1764308363199234, "rewards/accuracy_reward": 1.3745434284210205, "rewards/format_reward": 0.9921875, "step": 401 }, { "completion_length": 71.8671875, "epoch": 4.843373493975903, "grad_norm": 4.134424932683493, "kl": 0.126953125, "learning_rate": 1.9277108433734939e-07, "loss": 0.0051, "reward": 2.2129541635513306, "reward_std": 0.1565767452120781, "rewards/accuracy_reward": 1.2129541635513306, "rewards/format_reward": 1.0, "step": 402 }, { "completion_length": 64.0390625, "epoch": 4.855421686746988, "grad_norm": 4.135875391105592, "kl": 0.166015625, "learning_rate": 1.9076305220883534e-07, "loss": 0.0066, "reward": 2.3259581327438354, "reward_std": 0.2349315583705902, "rewards/accuracy_reward": 1.3259583115577698, "rewards/format_reward": 1.0, "step": 403 }, { "completion_length": 66.515625, "epoch": 4.867469879518072, "grad_norm": 4.276605246406482, "kl": 0.138916015625, "learning_rate": 1.8875502008032127e-07, "loss": 0.0056, "reward": 2.306966781616211, "reward_std": 0.2081274688243866, "rewards/accuracy_reward": 1.3069666624069214, "rewards/format_reward": 1.0, "step": 404 }, { "completion_length": 62.28125, "epoch": 4.879518072289157, "grad_norm": 4.594134632277065, "kl": 0.1826171875, "learning_rate": 1.8674698795180722e-07, "loss": 0.0073, "reward": 2.126552700996399, "reward_std": 0.255823478102684, "rewards/accuracy_reward": 1.1421778202056885, "rewards/format_reward": 0.984375, "step": 405 }, { "completion_length": 62.3671875, "epoch": 4.891566265060241, "grad_norm": 3.568434088807843, "kl": 0.14013671875, "learning_rate": 1.8473895582329315e-07, "loss": 0.0056, "reward": 2.417848587036133, "reward_std": 0.22225632518529892, "rewards/accuracy_reward": 1.4334735870361328, "rewards/format_reward": 0.984375, "step": 406 }, { "completion_length": 66.5078125, "epoch": 4.903614457831325, "grad_norm": 4.123527789276523, "kl": 0.10986328125, "learning_rate": 1.827309236947791e-07, "loss": 0.0044, "reward": 2.294624924659729, "reward_std": 0.19924252480268478, "rewards/accuracy_reward": 1.3024373650550842, "rewards/format_reward": 0.9921875, "step": 407 }, { "completion_length": 66.390625, "epoch": 4.9156626506024095, "grad_norm": 3.62978164804241, "kl": 0.12890625, "learning_rate": 1.8072289156626505e-07, "loss": 0.0051, "reward": 2.543404698371887, "reward_std": 0.1362360306084156, "rewards/accuracy_reward": 1.5434046983718872, "rewards/format_reward": 1.0, "step": 408 }, { "completion_length": 63.9765625, "epoch": 4.927710843373494, "grad_norm": 4.35384844886202, "kl": 0.12890625, "learning_rate": 1.78714859437751e-07, "loss": 0.0052, "reward": 2.418124198913574, "reward_std": 0.22236012667417526, "rewards/accuracy_reward": 1.4337490797042847, "rewards/format_reward": 0.984375, "step": 409 }, { "completion_length": 68.90625, "epoch": 4.9397590361445785, "grad_norm": 5.014972518639089, "kl": 0.1103515625, "learning_rate": 1.7670682730923694e-07, "loss": 0.0044, "reward": 2.4006751775741577, "reward_std": 0.16714774072170258, "rewards/accuracy_reward": 1.4006752967834473, "rewards/format_reward": 1.0, "step": 410 }, { "completion_length": 69.59375, "epoch": 4.951807228915663, "grad_norm": 7.696032017895469, "kl": 0.13916015625, "learning_rate": 1.746987951807229e-07, "loss": 0.0056, "reward": 2.395194172859192, "reward_std": 0.16039493680000305, "rewards/accuracy_reward": 1.3951941132545471, "rewards/format_reward": 1.0, "step": 411 }, { "completion_length": 70.125, "epoch": 4.9638554216867465, "grad_norm": 4.628350833888434, "kl": 0.149169921875, "learning_rate": 1.7269076305220884e-07, "loss": 0.006, "reward": 2.1348607540130615, "reward_std": 0.1709538996219635, "rewards/accuracy_reward": 1.1348606944084167, "rewards/format_reward": 1.0, "step": 412 }, { "completion_length": 66.2109375, "epoch": 4.975903614457831, "grad_norm": 3.188607704812383, "kl": 0.12646484375, "learning_rate": 1.706827309236948e-07, "loss": 0.0051, "reward": 2.302504062652588, "reward_std": 0.2623682767152786, "rewards/accuracy_reward": 1.3181291222572327, "rewards/format_reward": 0.984375, "step": 413 }, { "completion_length": 64.171875, "epoch": 4.9879518072289155, "grad_norm": 3.9665667179390773, "kl": 0.128662109375, "learning_rate": 1.686746987951807e-07, "loss": 0.0052, "reward": 2.4097338914871216, "reward_std": 0.17293449118733406, "rewards/accuracy_reward": 1.4097338318824768, "rewards/format_reward": 1.0, "step": 414 }, { "completion_length": 77.33333587646484, "epoch": 5.0, "grad_norm": 3.313170759959086, "kl": 0.1083984375, "learning_rate": 1.6666666666666665e-07, "loss": 0.004, "reward": 2.2759520411491394, "reward_std": 0.1403224766254425, "rewards/accuracy_reward": 1.2759520411491394, "rewards/format_reward": 1.0, "step": 415 }, { "completion_length": 66.3203125, "epoch": 5.0120481927710845, "grad_norm": 4.277881132595083, "kl": 0.14306640625, "learning_rate": 1.646586345381526e-07, "loss": 0.0057, "reward": 2.373741865158081, "reward_std": 0.20744601637125015, "rewards/accuracy_reward": 1.3815542459487915, "rewards/format_reward": 0.9921875, "step": 416 }, { "completion_length": 66.53125, "epoch": 5.024096385542169, "grad_norm": 3.9929439696450575, "kl": 0.12939453125, "learning_rate": 1.6265060240963853e-07, "loss": 0.0052, "reward": 2.35166335105896, "reward_std": 0.2503097951412201, "rewards/accuracy_reward": 1.35166335105896, "rewards/format_reward": 1.0, "step": 417 }, { "completion_length": 68.625, "epoch": 5.036144578313253, "grad_norm": 4.023924792103433, "kl": 0.114013671875, "learning_rate": 1.6064257028112448e-07, "loss": 0.0046, "reward": 2.2476612329483032, "reward_std": 0.185993991792202, "rewards/accuracy_reward": 1.2554737329483032, "rewards/format_reward": 0.9921875, "step": 418 }, { "completion_length": 65.7421875, "epoch": 5.048192771084337, "grad_norm": 3.5711137415239618, "kl": 0.134033203125, "learning_rate": 1.5863453815261044e-07, "loss": 0.0054, "reward": 2.2856324911117554, "reward_std": 0.14102690666913986, "rewards/accuracy_reward": 1.2856324911117554, "rewards/format_reward": 1.0, "step": 419 }, { "completion_length": 65.1328125, "epoch": 5.0602409638554215, "grad_norm": 5.8881280705003505, "kl": 0.1259765625, "learning_rate": 1.566265060240964e-07, "loss": 0.005, "reward": 2.474275588989258, "reward_std": 0.2030300498008728, "rewards/accuracy_reward": 1.474275529384613, "rewards/format_reward": 1.0, "step": 420 }, { "completion_length": 59.453125, "epoch": 5.072289156626506, "grad_norm": 17.487945694806488, "kl": 0.1279296875, "learning_rate": 1.5461847389558232e-07, "loss": 0.0051, "reward": 2.468233823776245, "reward_std": 0.17333931475877762, "rewards/accuracy_reward": 1.4682338237762451, "rewards/format_reward": 1.0, "step": 421 }, { "completion_length": 67.7421875, "epoch": 5.0843373493975905, "grad_norm": 4.5642738703913865, "kl": 0.12646484375, "learning_rate": 1.5261044176706827e-07, "loss": 0.0051, "reward": 2.39510977268219, "reward_std": 0.1837218478322029, "rewards/accuracy_reward": 1.3951098918914795, "rewards/format_reward": 1.0, "step": 422 }, { "completion_length": 64.515625, "epoch": 5.096385542168675, "grad_norm": 7.684070732359071, "kl": 0.139892578125, "learning_rate": 1.5060240963855423e-07, "loss": 0.0056, "reward": 2.16294264793396, "reward_std": 0.14895135164260864, "rewards/accuracy_reward": 1.1707550883293152, "rewards/format_reward": 0.9921875, "step": 423 }, { "completion_length": 64.46875, "epoch": 5.108433734939759, "grad_norm": 3.930344733874979, "kl": 0.11669921875, "learning_rate": 1.4859437751004015e-07, "loss": 0.0047, "reward": 2.3980486392974854, "reward_std": 0.15896277129650116, "rewards/accuracy_reward": 1.3980485796928406, "rewards/format_reward": 1.0, "step": 424 }, { "completion_length": 68.875, "epoch": 5.120481927710843, "grad_norm": 6.912033255857147, "kl": 0.118896484375, "learning_rate": 1.4658634538152608e-07, "loss": 0.0048, "reward": 2.4401201009750366, "reward_std": 0.18969366699457169, "rewards/accuracy_reward": 1.440119981765747, "rewards/format_reward": 1.0, "step": 425 }, { "completion_length": 65.609375, "epoch": 5.132530120481928, "grad_norm": 3.6477005267341163, "kl": 0.1708984375, "learning_rate": 1.4457831325301203e-07, "loss": 0.0068, "reward": 2.300011992454529, "reward_std": 0.2104162722826004, "rewards/accuracy_reward": 1.300011932849884, "rewards/format_reward": 1.0, "step": 426 }, { "completion_length": 65.0859375, "epoch": 5.144578313253012, "grad_norm": 5.390081007205584, "kl": 0.12548828125, "learning_rate": 1.42570281124498e-07, "loss": 0.005, "reward": 2.407547354698181, "reward_std": 0.19479839503765106, "rewards/accuracy_reward": 1.4075472354888916, "rewards/format_reward": 1.0, "step": 427 }, { "completion_length": 65.8046875, "epoch": 5.156626506024097, "grad_norm": 5.842696773596783, "kl": 0.12255859375, "learning_rate": 1.4056224899598394e-07, "loss": 0.0049, "reward": 2.2872836589813232, "reward_std": 0.2501709461212158, "rewards/accuracy_reward": 1.2950963973999023, "rewards/format_reward": 0.9921875, "step": 428 }, { "completion_length": 67.2890625, "epoch": 5.168674698795181, "grad_norm": 3.9373211288360612, "kl": 0.134765625, "learning_rate": 1.3855421686746987e-07, "loss": 0.0054, "reward": 2.4114162921905518, "reward_std": 0.22173649817705154, "rewards/accuracy_reward": 1.419228732585907, "rewards/format_reward": 0.9921875, "step": 429 }, { "completion_length": 65.7265625, "epoch": 5.180722891566265, "grad_norm": 5.989728831260378, "kl": 0.20263671875, "learning_rate": 1.3654618473895582e-07, "loss": 0.0081, "reward": 2.349661111831665, "reward_std": 0.24485966563224792, "rewards/accuracy_reward": 1.3496609926223755, "rewards/format_reward": 1.0, "step": 430 }, { "completion_length": 71.0390625, "epoch": 5.192771084337349, "grad_norm": 4.9722233041190425, "kl": 0.11083984375, "learning_rate": 1.3453815261044177e-07, "loss": 0.0044, "reward": 2.423168659210205, "reward_std": 0.16536322236061096, "rewards/accuracy_reward": 1.4231685996055603, "rewards/format_reward": 1.0, "step": 431 }, { "completion_length": 66.234375, "epoch": 5.204819277108434, "grad_norm": 3.5058259130400162, "kl": 0.1376953125, "learning_rate": 1.3253012048192773e-07, "loss": 0.0055, "reward": 2.2352651357650757, "reward_std": 0.18688317388296127, "rewards/accuracy_reward": 1.2352651357650757, "rewards/format_reward": 1.0, "step": 432 }, { "completion_length": 72.8203125, "epoch": 5.216867469879518, "grad_norm": 3.8748331360003485, "kl": 0.130859375, "learning_rate": 1.3052208835341366e-07, "loss": 0.0052, "reward": 2.3151748180389404, "reward_std": 0.21110112965106964, "rewards/accuracy_reward": 1.3229871988296509, "rewards/format_reward": 0.9921875, "step": 433 }, { "completion_length": 68.8671875, "epoch": 5.228915662650603, "grad_norm": 3.985332448415374, "kl": 0.1220703125, "learning_rate": 1.2851405622489958e-07, "loss": 0.0049, "reward": 2.26615047454834, "reward_std": 0.20259422063827515, "rewards/accuracy_reward": 1.2739630937576294, "rewards/format_reward": 0.9921875, "step": 434 }, { "completion_length": 64.0234375, "epoch": 5.240963855421687, "grad_norm": 4.209088113123041, "kl": 0.119873046875, "learning_rate": 1.2650602409638554e-07, "loss": 0.0048, "reward": 2.345677137374878, "reward_std": 0.16655350476503372, "rewards/accuracy_reward": 1.345677137374878, "rewards/format_reward": 1.0, "step": 435 }, { "completion_length": 72.2109375, "epoch": 5.253012048192771, "grad_norm": 3.7180924645581994, "kl": 0.13427734375, "learning_rate": 1.2449799196787146e-07, "loss": 0.0054, "reward": 2.163213849067688, "reward_std": 0.3149610310792923, "rewards/accuracy_reward": 1.1866515278816223, "rewards/format_reward": 0.9765625, "step": 436 }, { "completion_length": 65.328125, "epoch": 5.265060240963855, "grad_norm": 3.8280472693841556, "kl": 0.12744140625, "learning_rate": 1.2248995983935742e-07, "loss": 0.0051, "reward": 2.3446794748306274, "reward_std": 0.22430174052715302, "rewards/accuracy_reward": 1.3446794152259827, "rewards/format_reward": 1.0, "step": 437 }, { "completion_length": 64.65625, "epoch": 5.27710843373494, "grad_norm": 5.861122122648032, "kl": 0.12060546875, "learning_rate": 1.2048192771084337e-07, "loss": 0.0048, "reward": 2.379356861114502, "reward_std": 0.1506607085466385, "rewards/accuracy_reward": 1.3871691226959229, "rewards/format_reward": 0.9921875, "step": 438 }, { "completion_length": 71.1171875, "epoch": 5.289156626506024, "grad_norm": 3.8119653679452092, "kl": 0.12353515625, "learning_rate": 1.1847389558232931e-07, "loss": 0.0049, "reward": 2.388357400894165, "reward_std": 0.23687779903411865, "rewards/accuracy_reward": 1.3961697816848755, "rewards/format_reward": 0.9921875, "step": 439 }, { "completion_length": 72.3515625, "epoch": 5.301204819277109, "grad_norm": 3.9178115284886372, "kl": 0.095458984375, "learning_rate": 1.1646586345381526e-07, "loss": 0.0038, "reward": 2.6513583660125732, "reward_std": 0.17830242216587067, "rewards/accuracy_reward": 1.6513583660125732, "rewards/format_reward": 1.0, "step": 440 }, { "completion_length": 68.921875, "epoch": 5.313253012048193, "grad_norm": 4.623442869387058, "kl": 0.100830078125, "learning_rate": 1.1445783132530119e-07, "loss": 0.004, "reward": 2.549654483795166, "reward_std": 0.16079290956258774, "rewards/accuracy_reward": 1.5574671030044556, "rewards/format_reward": 0.9921875, "step": 441 }, { "completion_length": 71.3203125, "epoch": 5.325301204819277, "grad_norm": 5.278895722638805, "kl": 0.10986328125, "learning_rate": 1.1244979919678714e-07, "loss": 0.0044, "reward": 2.203883409500122, "reward_std": 0.258064404129982, "rewards/accuracy_reward": 1.2116957902908325, "rewards/format_reward": 0.9921875, "step": 442 }, { "completion_length": 69.515625, "epoch": 5.337349397590361, "grad_norm": 4.142710717599773, "kl": 0.113525390625, "learning_rate": 1.1044176706827308e-07, "loss": 0.0045, "reward": 2.1769516468048096, "reward_std": 0.275626465678215, "rewards/accuracy_reward": 1.1769516468048096, "rewards/format_reward": 1.0, "step": 443 }, { "completion_length": 68.3203125, "epoch": 5.349397590361446, "grad_norm": 4.180078412016221, "kl": 0.147216796875, "learning_rate": 1.0843373493975904e-07, "loss": 0.0059, "reward": 2.381720542907715, "reward_std": 0.20287376642227173, "rewards/accuracy_reward": 1.3817205429077148, "rewards/format_reward": 1.0, "step": 444 }, { "completion_length": 69.7421875, "epoch": 5.36144578313253, "grad_norm": 3.7523897150785603, "kl": 0.12939453125, "learning_rate": 1.0642570281124498e-07, "loss": 0.0052, "reward": 2.3669261932373047, "reward_std": 0.2056456208229065, "rewards/accuracy_reward": 1.3747385740280151, "rewards/format_reward": 0.9921875, "step": 445 }, { "completion_length": 67.7109375, "epoch": 5.373493975903615, "grad_norm": 4.924758819089559, "kl": 0.185546875, "learning_rate": 1.0441767068273092e-07, "loss": 0.0074, "reward": 2.4100332260131836, "reward_std": 0.22913093864917755, "rewards/accuracy_reward": 1.4178457260131836, "rewards/format_reward": 0.9921875, "step": 446 }, { "completion_length": 69.1875, "epoch": 5.385542168674699, "grad_norm": 3.080626056952063, "kl": 0.122314453125, "learning_rate": 1.0240963855421686e-07, "loss": 0.0049, "reward": 2.3073067665100098, "reward_std": 0.23586007952690125, "rewards/accuracy_reward": 1.315119206905365, "rewards/format_reward": 0.9921875, "step": 447 }, { "completion_length": 67.59375, "epoch": 5.397590361445783, "grad_norm": 3.8573400804993314, "kl": 0.128662109375, "learning_rate": 1.0040160642570281e-07, "loss": 0.0051, "reward": 2.2195699214935303, "reward_std": 0.18059836328029633, "rewards/accuracy_reward": 1.2195698618888855, "rewards/format_reward": 1.0, "step": 448 }, { "completion_length": 65.0078125, "epoch": 5.409638554216867, "grad_norm": 9.729377045307634, "kl": 0.110107421875, "learning_rate": 9.839357429718875e-08, "loss": 0.0044, "reward": 2.335146427154541, "reward_std": 0.20962534099817276, "rewards/accuracy_reward": 1.3429590463638306, "rewards/format_reward": 0.9921875, "step": 449 }, { "completion_length": 76.171875, "epoch": 5.421686746987952, "grad_norm": 5.139417091846479, "kl": 0.17626953125, "learning_rate": 9.638554216867469e-08, "loss": 0.0071, "reward": 2.2514326572418213, "reward_std": 0.18450473248958588, "rewards/accuracy_reward": 1.2592450976371765, "rewards/format_reward": 0.9921875, "step": 450 }, { "completion_length": 68.046875, "epoch": 5.433734939759036, "grad_norm": 3.961385062957452, "kl": 0.10693359375, "learning_rate": 9.437751004016063e-08, "loss": 0.0043, "reward": 2.328533172607422, "reward_std": 0.18290965259075165, "rewards/accuracy_reward": 1.3285331726074219, "rewards/format_reward": 1.0, "step": 451 }, { "completion_length": 68.6953125, "epoch": 5.445783132530121, "grad_norm": 4.887519681333338, "kl": 0.103759765625, "learning_rate": 9.236947791164657e-08, "loss": 0.0042, "reward": 2.3144426345825195, "reward_std": 0.21034369617700577, "rewards/accuracy_reward": 1.3144426941871643, "rewards/format_reward": 1.0, "step": 452 }, { "completion_length": 68.0, "epoch": 5.457831325301205, "grad_norm": 3.80893967356862, "kl": 0.127685546875, "learning_rate": 9.036144578313253e-08, "loss": 0.0051, "reward": 2.4345412254333496, "reward_std": 0.2006332352757454, "rewards/accuracy_reward": 1.4345412254333496, "rewards/format_reward": 1.0, "step": 453 }, { "completion_length": 67.046875, "epoch": 5.469879518072289, "grad_norm": 4.2954066473287815, "kl": 0.12841796875, "learning_rate": 8.835341365461847e-08, "loss": 0.0052, "reward": 2.353352427482605, "reward_std": 0.22566306591033936, "rewards/accuracy_reward": 1.353352427482605, "rewards/format_reward": 1.0, "step": 454 }, { "completion_length": 64.8984375, "epoch": 5.481927710843373, "grad_norm": 4.546803918905019, "kl": 0.1337890625, "learning_rate": 8.634538152610442e-08, "loss": 0.0054, "reward": 2.3113902807235718, "reward_std": 0.20004340261220932, "rewards/accuracy_reward": 1.3192027807235718, "rewards/format_reward": 0.9921875, "step": 455 }, { "completion_length": 66.1640625, "epoch": 5.493975903614458, "grad_norm": 3.5466190382737883, "kl": 0.123046875, "learning_rate": 8.433734939759035e-08, "loss": 0.0049, "reward": 2.3270002603530884, "reward_std": 0.21506989747285843, "rewards/accuracy_reward": 1.3270001411437988, "rewards/format_reward": 1.0, "step": 456 }, { "completion_length": 72.3984375, "epoch": 5.506024096385542, "grad_norm": 5.213818604387868, "kl": 0.1328125, "learning_rate": 8.23293172690763e-08, "loss": 0.0053, "reward": 2.4117329120635986, "reward_std": 0.21075783669948578, "rewards/accuracy_reward": 1.411732792854309, "rewards/format_reward": 1.0, "step": 457 }, { "completion_length": 63.4140625, "epoch": 5.518072289156627, "grad_norm": 4.087135154378612, "kl": 0.1142578125, "learning_rate": 8.032128514056224e-08, "loss": 0.0046, "reward": 2.2361518144607544, "reward_std": 0.15534771978855133, "rewards/accuracy_reward": 1.2361518740653992, "rewards/format_reward": 1.0, "step": 458 }, { "completion_length": 66.6796875, "epoch": 5.530120481927711, "grad_norm": 3.8509871084036083, "kl": 0.12255859375, "learning_rate": 7.83132530120482e-08, "loss": 0.0049, "reward": 2.402904510498047, "reward_std": 0.18761365860700607, "rewards/accuracy_reward": 1.4029043912887573, "rewards/format_reward": 1.0, "step": 459 }, { "completion_length": 67.921875, "epoch": 5.542168674698795, "grad_norm": 3.8868143152174714, "kl": 0.1201171875, "learning_rate": 7.630522088353414e-08, "loss": 0.0048, "reward": 2.202209234237671, "reward_std": 0.20886321365833282, "rewards/accuracy_reward": 1.2022093534469604, "rewards/format_reward": 1.0, "step": 460 }, { "completion_length": 69.84375, "epoch": 5.554216867469879, "grad_norm": 9.828452094441177, "kl": 0.138427734375, "learning_rate": 7.429718875502008e-08, "loss": 0.0055, "reward": 2.255289673805237, "reward_std": 0.3091956526041031, "rewards/accuracy_reward": 1.2787271738052368, "rewards/format_reward": 0.9765625, "step": 461 }, { "completion_length": 67.7265625, "epoch": 5.566265060240964, "grad_norm": 3.5884325923981777, "kl": 0.14501953125, "learning_rate": 7.228915662650602e-08, "loss": 0.0058, "reward": 2.389763116836548, "reward_std": 0.1989041194319725, "rewards/accuracy_reward": 1.3897631168365479, "rewards/format_reward": 1.0, "step": 462 }, { "completion_length": 63.4765625, "epoch": 5.578313253012048, "grad_norm": 3.943165256338966, "kl": 0.15185546875, "learning_rate": 7.028112449799197e-08, "loss": 0.0061, "reward": 2.2263519763946533, "reward_std": 0.22419632971286774, "rewards/accuracy_reward": 1.2341644763946533, "rewards/format_reward": 0.9921875, "step": 463 }, { "completion_length": 67.734375, "epoch": 5.590361445783133, "grad_norm": 8.892123036444877, "kl": 0.126953125, "learning_rate": 6.827309236947791e-08, "loss": 0.0051, "reward": 2.3126423358917236, "reward_std": 0.17722339183092117, "rewards/accuracy_reward": 1.3126422762870789, "rewards/format_reward": 1.0, "step": 464 }, { "completion_length": 75.5546875, "epoch": 5.602409638554217, "grad_norm": 4.229071556328315, "kl": 0.1240234375, "learning_rate": 6.626506024096386e-08, "loss": 0.005, "reward": 2.2280049324035645, "reward_std": 0.22474994510412216, "rewards/accuracy_reward": 1.235817551612854, "rewards/format_reward": 0.9921875, "step": 465 }, { "completion_length": 66.9609375, "epoch": 5.614457831325301, "grad_norm": 4.577684554062664, "kl": 0.12451171875, "learning_rate": 6.425702811244979e-08, "loss": 0.005, "reward": 2.2235909700393677, "reward_std": 0.22441789507865906, "rewards/accuracy_reward": 1.2392158508300781, "rewards/format_reward": 0.984375, "step": 466 }, { "completion_length": 70.4375, "epoch": 5.626506024096385, "grad_norm": 4.349159327486559, "kl": 0.112548828125, "learning_rate": 6.224899598393573e-08, "loss": 0.0045, "reward": 2.3591808080673218, "reward_std": 0.1966349333524704, "rewards/accuracy_reward": 1.3669933080673218, "rewards/format_reward": 0.9921875, "step": 467 }, { "completion_length": 69.4453125, "epoch": 5.63855421686747, "grad_norm": 3.0423100870405437, "kl": 0.138671875, "learning_rate": 6.024096385542168e-08, "loss": 0.0055, "reward": 2.4168301820755005, "reward_std": 0.23313428461551666, "rewards/accuracy_reward": 1.4246427416801453, "rewards/format_reward": 0.9921875, "step": 468 }, { "completion_length": 67.9453125, "epoch": 5.650602409638554, "grad_norm": 4.8492295392656075, "kl": 0.124755859375, "learning_rate": 5.823293172690763e-08, "loss": 0.005, "reward": 2.3264076709747314, "reward_std": 0.18676774948835373, "rewards/accuracy_reward": 1.3264076709747314, "rewards/format_reward": 1.0, "step": 469 }, { "completion_length": 68.3984375, "epoch": 5.662650602409639, "grad_norm": 3.7143887896006706, "kl": 0.118896484375, "learning_rate": 5.622489959839357e-08, "loss": 0.0048, "reward": 2.275146722793579, "reward_std": 0.23441863059997559, "rewards/accuracy_reward": 1.2907716631889343, "rewards/format_reward": 0.984375, "step": 470 }, { "completion_length": 69.703125, "epoch": 5.674698795180722, "grad_norm": 6.421818895030251, "kl": 0.105712890625, "learning_rate": 5.421686746987952e-08, "loss": 0.0042, "reward": 2.3713172674179077, "reward_std": 0.17046835273504257, "rewards/accuracy_reward": 1.3713172674179077, "rewards/format_reward": 1.0, "step": 471 }, { "completion_length": 71.7578125, "epoch": 5.686746987951807, "grad_norm": 3.7429303333646846, "kl": 0.17333984375, "learning_rate": 5.220883534136546e-08, "loss": 0.0069, "reward": 2.21248197555542, "reward_std": 0.1897253841161728, "rewards/accuracy_reward": 1.2202943563461304, "rewards/format_reward": 0.9921875, "step": 472 }, { "completion_length": 66.0625, "epoch": 5.698795180722891, "grad_norm": 4.6125292648898375, "kl": 0.1171875, "learning_rate": 5.0200803212851406e-08, "loss": 0.0047, "reward": 2.3862085342407227, "reward_std": 0.14106625318527222, "rewards/accuracy_reward": 1.3940210938453674, "rewards/format_reward": 0.9921875, "step": 473 }, { "completion_length": 71.4296875, "epoch": 5.710843373493976, "grad_norm": 4.192704287374918, "kl": 0.108642578125, "learning_rate": 4.8192771084337347e-08, "loss": 0.0043, "reward": 2.3476767539978027, "reward_std": 0.20362288504838943, "rewards/accuracy_reward": 1.3476767539978027, "rewards/format_reward": 1.0, "step": 474 }, { "completion_length": 67.2109375, "epoch": 5.72289156626506, "grad_norm": 4.1447657242460645, "kl": 0.1298828125, "learning_rate": 4.618473895582329e-08, "loss": 0.0052, "reward": 2.266420602798462, "reward_std": 0.2129717692732811, "rewards/accuracy_reward": 1.2664207220077515, "rewards/format_reward": 1.0, "step": 475 }, { "completion_length": 66.546875, "epoch": 5.734939759036145, "grad_norm": 3.4345215566799574, "kl": 0.106201171875, "learning_rate": 4.4176706827309234e-08, "loss": 0.0042, "reward": 2.352730870246887, "reward_std": 0.1454787813127041, "rewards/accuracy_reward": 1.3605434894561768, "rewards/format_reward": 0.9921875, "step": 476 }, { "completion_length": 71.828125, "epoch": 5.746987951807229, "grad_norm": 4.187659893839478, "kl": 0.111328125, "learning_rate": 4.2168674698795174e-08, "loss": 0.0045, "reward": 2.2670211791992188, "reward_std": 0.22116923332214355, "rewards/accuracy_reward": 1.267021119594574, "rewards/format_reward": 1.0, "step": 477 }, { "completion_length": 69.1875, "epoch": 5.759036144578313, "grad_norm": 3.8623536023281617, "kl": 0.114013671875, "learning_rate": 4.016064257028112e-08, "loss": 0.0046, "reward": 2.222132921218872, "reward_std": 0.23479964584112167, "rewards/accuracy_reward": 1.2221328020095825, "rewards/format_reward": 1.0, "step": 478 }, { "completion_length": 70.9296875, "epoch": 5.771084337349397, "grad_norm": 4.262446208684037, "kl": 0.09375, "learning_rate": 3.815261044176707e-08, "loss": 0.0037, "reward": 2.2334243059158325, "reward_std": 0.21778832376003265, "rewards/accuracy_reward": 1.2334243059158325, "rewards/format_reward": 1.0, "step": 479 }, { "completion_length": 68.2421875, "epoch": 5.783132530120482, "grad_norm": 3.475197673617196, "kl": 0.10595703125, "learning_rate": 3.614457831325301e-08, "loss": 0.0042, "reward": 2.4461944103240967, "reward_std": 0.21106188744306564, "rewards/accuracy_reward": 1.4540069103240967, "rewards/format_reward": 0.9921875, "step": 480 }, { "completion_length": 70.3671875, "epoch": 5.795180722891566, "grad_norm": 4.56883704942929, "kl": 0.11865234375, "learning_rate": 3.4136546184738955e-08, "loss": 0.0047, "reward": 2.441108226776123, "reward_std": 0.2091435343027115, "rewards/accuracy_reward": 1.441108226776123, "rewards/format_reward": 1.0, "step": 481 }, { "completion_length": 69.171875, "epoch": 5.807228915662651, "grad_norm": 3.959761896565078, "kl": 0.12451171875, "learning_rate": 3.2128514056224896e-08, "loss": 0.005, "reward": 2.3847368955612183, "reward_std": 0.14646587148308754, "rewards/accuracy_reward": 1.3847368359565735, "rewards/format_reward": 1.0, "step": 482 }, { "completion_length": 75.3125, "epoch": 5.8192771084337345, "grad_norm": 4.6238410926161855, "kl": 0.108642578125, "learning_rate": 3.012048192771084e-08, "loss": 0.0043, "reward": 2.2356351613998413, "reward_std": 0.3032216280698776, "rewards/accuracy_reward": 1.2434476613998413, "rewards/format_reward": 0.9921875, "step": 483 }, { "completion_length": 70.921875, "epoch": 5.831325301204819, "grad_norm": 4.963499305554948, "kl": 0.082275390625, "learning_rate": 2.8112449799196786e-08, "loss": 0.0033, "reward": 2.3230150938034058, "reward_std": 0.16892920434474945, "rewards/accuracy_reward": 1.3230149745941162, "rewards/format_reward": 1.0, "step": 484 }, { "completion_length": 69.3359375, "epoch": 5.843373493975903, "grad_norm": 4.069771837808966, "kl": 0.1396484375, "learning_rate": 2.610441767068273e-08, "loss": 0.0056, "reward": 2.327863335609436, "reward_std": 0.23238816112279892, "rewards/accuracy_reward": 1.3434883952140808, "rewards/format_reward": 0.984375, "step": 485 }, { "completion_length": 68.875, "epoch": 5.855421686746988, "grad_norm": 4.471391988945464, "kl": 0.13330078125, "learning_rate": 2.4096385542168673e-08, "loss": 0.0053, "reward": 2.331111192703247, "reward_std": 0.1987084299325943, "rewards/accuracy_reward": 1.3389237523078918, "rewards/format_reward": 0.9921875, "step": 486 }, { "completion_length": 72.2734375, "epoch": 5.867469879518072, "grad_norm": 4.3661266337784514, "kl": 0.128173828125, "learning_rate": 2.2088353413654617e-08, "loss": 0.0051, "reward": 2.2740135192871094, "reward_std": 0.17679665982723236, "rewards/accuracy_reward": 1.2740132808685303, "rewards/format_reward": 1.0, "step": 487 }, { "completion_length": 69.328125, "epoch": 5.879518072289157, "grad_norm": 4.78815312664634, "kl": 0.150634765625, "learning_rate": 2.008032128514056e-08, "loss": 0.006, "reward": 2.2422866821289062, "reward_std": 0.23693696409463882, "rewards/accuracy_reward": 1.2422866821289062, "rewards/format_reward": 1.0, "step": 488 }, { "completion_length": 71.4140625, "epoch": 5.891566265060241, "grad_norm": 6.245102077972556, "kl": 0.121826171875, "learning_rate": 1.8072289156626504e-08, "loss": 0.0049, "reward": 2.315194010734558, "reward_std": 0.1885218769311905, "rewards/accuracy_reward": 1.3230066299438477, "rewards/format_reward": 0.9921875, "step": 489 }, { "completion_length": 63.8984375, "epoch": 5.903614457831325, "grad_norm": 4.510763484461414, "kl": 0.122314453125, "learning_rate": 1.6064257028112448e-08, "loss": 0.0049, "reward": 2.3149102926254272, "reward_std": 0.1639706939458847, "rewards/accuracy_reward": 1.3149102926254272, "rewards/format_reward": 1.0, "step": 490 }, { "completion_length": 66.0, "epoch": 5.9156626506024095, "grad_norm": 4.091329557372317, "kl": 0.1435546875, "learning_rate": 1.4056224899598393e-08, "loss": 0.0058, "reward": 2.4370064735412598, "reward_std": 0.15971215814352036, "rewards/accuracy_reward": 1.4370064735412598, "rewards/format_reward": 1.0, "step": 491 }, { "completion_length": 70.484375, "epoch": 5.927710843373494, "grad_norm": 4.3856574896033305, "kl": 0.155029296875, "learning_rate": 1.2048192771084337e-08, "loss": 0.0062, "reward": 2.351839542388916, "reward_std": 0.2616487815976143, "rewards/accuracy_reward": 1.359652042388916, "rewards/format_reward": 0.9921875, "step": 492 }, { "completion_length": 74.171875, "epoch": 5.9397590361445785, "grad_norm": 3.3373281083458974, "kl": 0.107177734375, "learning_rate": 1.004016064257028e-08, "loss": 0.0043, "reward": 2.3034894466400146, "reward_std": 0.12144535779953003, "rewards/accuracy_reward": 1.3113019466400146, "rewards/format_reward": 0.9921875, "step": 493 }, { "completion_length": 72.8515625, "epoch": 5.951807228915663, "grad_norm": 3.3157754210190773, "kl": 0.097412109375, "learning_rate": 8.032128514056224e-09, "loss": 0.0039, "reward": 2.421133041381836, "reward_std": 0.16620434820652008, "rewards/accuracy_reward": 1.421133041381836, "rewards/format_reward": 1.0, "step": 494 }, { "completion_length": 76.1328125, "epoch": 5.9638554216867465, "grad_norm": 3.788575194538334, "kl": 0.12158203125, "learning_rate": 6.024096385542168e-09, "loss": 0.0049, "reward": 2.3588104248046875, "reward_std": 0.1766229048371315, "rewards/accuracy_reward": 1.358810544013977, "rewards/format_reward": 1.0, "step": 495 }, { "completion_length": 71.515625, "epoch": 5.975903614457831, "grad_norm": 4.2730966058785835, "kl": 0.11962890625, "learning_rate": 4.016064257028112e-09, "loss": 0.0048, "reward": 2.3155951499938965, "reward_std": 0.25304850190877914, "rewards/accuracy_reward": 1.3234076499938965, "rewards/format_reward": 0.9921875, "step": 496 }, { "completion_length": 68.859375, "epoch": 5.9879518072289155, "grad_norm": 4.371956801820215, "kl": 0.119140625, "learning_rate": 2.008032128514056e-09, "loss": 0.0048, "reward": 2.3737374544143677, "reward_std": 0.20605729520320892, "rewards/accuracy_reward": 1.373737394809723, "rewards/format_reward": 1.0, "step": 497 }, { "completion_length": 60.75000190734863, "epoch": 6.0, "grad_norm": 3.9720317304626964, "kl": 0.1171875, "learning_rate": 0.0, "loss": 0.0046, "reward": 2.4247955083847046, "reward_std": 0.17968511581420898, "rewards/accuracy_reward": 1.4247953295707703, "rewards/format_reward": 1.0, "step": 498 } ], "logging_steps": 1.0, "max_steps": 498, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }