{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.944, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 450.06251525878906, "epoch": 0.016, "grad_norm": 0.8495607523800367, "kl": 0.0, "learning_rate": 3.3333333333333334e-08, "loss": 0.0, "reward": 0.502269446849823, "reward_std": 0.42649583518505096, "rewards/correct_code_reward_func": 0.3958333432674408, "rewards/len_reward_func": 0.1064360924065113, "step": 1 }, { "completion_length": 507.75001525878906, "epoch": 0.032, "grad_norm": 1.371013897533057, "kl": 0.0, "learning_rate": 6.666666666666667e-08, "loss": -0.0, "reward": 0.31612901389598846, "reward_std": 0.5700598657131195, "rewards/correct_code_reward_func": 0.2708333432674408, "rewards/len_reward_func": 0.045295679941773415, "step": 2 }, { "completion_length": 493.93751525878906, "epoch": 0.048, "grad_norm": 0.5754588380314621, "kl": 3.3274292945861816e-05, "learning_rate": 1e-07, "loss": 0.0, "reward": 0.44113826751708984, "reward_std": 0.5670813024044037, "rewards/correct_code_reward_func": 0.3750000149011612, "rewards/len_reward_func": 0.06613820930942893, "step": 3 }, { "completion_length": 530.4583435058594, "epoch": 0.064, "grad_norm": 1.6421888296529672, "kl": -2.0503997802734375e-05, "learning_rate": 1.3333333333333334e-07, "loss": -0.0, "reward": 0.16917918622493744, "reward_std": 0.43559131026268005, "rewards/correct_code_reward_func": 0.125, "rewards/len_reward_func": 0.044179188553243876, "step": 4 }, { "completion_length": 405.43751525878906, "epoch": 0.08, "grad_norm": 2.4469782065215795, "kl": 6.712973117828369e-06, "learning_rate": 1.6666666666666665e-07, "loss": 0.0, "reward": 0.21715006977319717, "reward_std": 0.3707175552845001, "rewards/correct_code_reward_func": 0.0416666679084301, "rewards/len_reward_func": 0.17548342049121857, "step": 5 }, { "completion_length": 520.6041717529297, "epoch": 0.096, "grad_norm": 0.46306313754526307, "kl": -2.2113323211669922e-05, "learning_rate": 2e-07, "loss": -0.0, "reward": 0.37700220942497253, "reward_std": 0.5394288003444672, "rewards/correct_code_reward_func": 0.3541666716337204, "rewards/len_reward_func": 0.02283555455505848, "step": 6 }, { "completion_length": 451.5, "epoch": 0.112, "grad_norm": 2.609645604720621, "kl": -1.2526754289865494e-05, "learning_rate": 2.3333333333333333e-07, "loss": -0.0, "reward": 0.20611736923456192, "reward_std": 0.37926262617111206, "rewards/correct_code_reward_func": 0.0833333358168602, "rewards/len_reward_func": 0.12278405949473381, "step": 7 }, { "completion_length": 471.4375, "epoch": 0.128, "grad_norm": 0.4804918460566338, "kl": -2.968311309814453e-05, "learning_rate": 2.6666666666666667e-07, "loss": -0.0, "reward": 0.38565604388713837, "reward_std": 0.4431494176387787, "rewards/correct_code_reward_func": 0.2916666716337204, "rewards/len_reward_func": 0.0939893783070147, "step": 8 }, { "completion_length": 463.22918701171875, "epoch": 0.144, "grad_norm": 0.3372987851073737, "kl": 6.516464054584503e-06, "learning_rate": 3e-07, "loss": -0.0, "reward": 0.44847334921360016, "reward_std": 0.5335665941238403, "rewards/correct_code_reward_func": 0.3333333432674408, "rewards/len_reward_func": 0.11514000222086906, "step": 9 }, { "completion_length": 361.62501525878906, "epoch": 0.16, "grad_norm": 0.7532184792335336, "kl": 1.1920928955078125e-06, "learning_rate": 3.333333333333333e-07, "loss": -0.0, "reward": 0.456779420375824, "reward_std": 0.5467902272939682, "rewards/correct_code_reward_func": 0.2708333432674408, "rewards/len_reward_func": 0.18594606965780258, "step": 10 }, { "completion_length": 473.2083435058594, "epoch": 0.176, "grad_norm": 0.5536327901950325, "kl": -1.7859041690826416e-05, "learning_rate": 3.666666666666666e-07, "loss": -0.0, "reward": 0.29044337570667267, "reward_std": 0.4437016546726227, "rewards/correct_code_reward_func": 0.1875, "rewards/len_reward_func": 0.10294336080551147, "step": 11 }, { "completion_length": 328.50001525878906, "epoch": 0.192, "grad_norm": 1.0753885525580353, "kl": 1.913309097290039e-05, "learning_rate": 4e-07, "loss": 0.0, "reward": 0.2568899691104889, "reward_std": 0.4477514177560806, "rewards/correct_code_reward_func": 0.0625, "rewards/len_reward_func": 0.1943899691104889, "step": 12 }, { "completion_length": 576.0208435058594, "epoch": 0.208, "grad_norm": 0.6665638976585759, "kl": 1.4901161193847656e-05, "learning_rate": 4.3333333333333335e-07, "loss": 0.0, "reward": 0.4603695869445801, "reward_std": 0.5321745276451111, "rewards/correct_code_reward_func": 0.4791666865348816, "rewards/len_reward_func": -0.018797069787979126, "step": 13 }, { "completion_length": 372.1458435058594, "epoch": 0.224, "grad_norm": 0.5376139925261669, "kl": -7.539987564086914e-06, "learning_rate": 4.6666666666666666e-07, "loss": 0.0, "reward": 0.35033082216978073, "reward_std": 0.4991242587566376, "rewards/correct_code_reward_func": 0.1875000111758709, "rewards/len_reward_func": 0.16283082962036133, "step": 14 }, { "completion_length": 368.0625, "epoch": 0.24, "grad_norm": 0.7714093926878073, "kl": -1.0609626770019531e-05, "learning_rate": 5e-07, "loss": -0.0, "reward": 0.5642553567886353, "reward_std": 0.4422197937965393, "rewards/correct_code_reward_func": 0.375, "rewards/len_reward_func": 0.18925533443689346, "step": 15 }, { "completion_length": 355.3125, "epoch": 0.256, "grad_norm": 5.105471931223821, "kl": 0.00017064809799194336, "learning_rate": 4.999947552503497e-07, "loss": 0.0, "reward": 0.26038650423288345, "reward_std": 0.4197434335947037, "rewards/correct_code_reward_func": 0.125, "rewards/len_reward_func": 0.13538648188114166, "step": 16 }, { "completion_length": 412.62501525878906, "epoch": 0.272, "grad_norm": 2.3022353306620618, "kl": 7.212162017822266e-05, "learning_rate": 4.999790212214579e-07, "loss": 0.0, "reward": 0.2577357590198517, "reward_std": 0.4265839755535126, "rewards/correct_code_reward_func": 0.1666666716337204, "rewards/len_reward_func": 0.09106908613466658, "step": 17 }, { "completion_length": 378.3333435058594, "epoch": 0.288, "grad_norm": 0.4875991126449715, "kl": -3.3736228942871094e-05, "learning_rate": 4.999527985734931e-07, "loss": -0.0, "reward": 0.4722817540168762, "reward_std": 0.5266325920820236, "rewards/correct_code_reward_func": 0.2916666716337204, "rewards/len_reward_func": 0.18061506003141403, "step": 18 }, { "completion_length": 344.5208435058594, "epoch": 0.304, "grad_norm": 1.0396038838546207, "kl": 0.00010180473327636719, "learning_rate": 4.99916088406705e-07, "loss": 0.0, "reward": 0.3391585648059845, "reward_std": 0.5373022556304932, "rewards/correct_code_reward_func": 0.2291666679084301, "rewards/len_reward_func": 0.10999187082052231, "step": 19 }, { "completion_length": 451.8958435058594, "epoch": 0.32, "grad_norm": 2.947144360733391, "kl": 0.0004801750183105469, "learning_rate": 4.998688922613787e-07, "loss": 0.0, "reward": 0.25417882204055786, "reward_std": 0.3951431214809418, "rewards/correct_code_reward_func": 0.1458333395421505, "rewards/len_reward_func": 0.10834548436105251, "step": 20 }, { "completion_length": 351.14583587646484, "epoch": 0.336, "grad_norm": 0.3323592751582949, "kl": 6.604194641113281e-05, "learning_rate": 4.998112121177698e-07, "loss": 0.0, "reward": 0.533460944890976, "reward_std": 0.528270423412323, "rewards/correct_code_reward_func": 0.375, "rewards/len_reward_func": 0.15846090763807297, "step": 21 }, { "completion_length": 350.62501525878906, "epoch": 0.352, "grad_norm": 0.48509498294245135, "kl": 0.00014066696166992188, "learning_rate": 4.997430503960219e-07, "loss": 0.0, "reward": 0.4158083647489548, "reward_std": 0.5351734161376953, "rewards/correct_code_reward_func": 0.2500000074505806, "rewards/len_reward_func": 0.16580835729837418, "step": 22 }, { "completion_length": 392.43751525878906, "epoch": 0.368, "grad_norm": 1.106216791589586, "kl": 0.0010752677917480469, "learning_rate": 4.996644099560641e-07, "loss": 0.0, "reward": 0.3105452209711075, "reward_std": 0.45272597670555115, "rewards/correct_code_reward_func": 0.14583333395421505, "rewards/len_reward_func": 0.16471190005540848, "step": 23 }, { "completion_length": 575.0833740234375, "epoch": 0.384, "grad_norm": 0.5249553524839714, "kl": 0.0004450082778930664, "learning_rate": 4.995752940974918e-07, "loss": 0.0, "reward": 0.13311870768666267, "reward_std": 0.46341927349567413, "rewards/correct_code_reward_func": 0.1041666679084301, "rewards/len_reward_func": 0.028952032327651978, "step": 24 }, { "completion_length": 332.4583435058594, "epoch": 0.4, "grad_norm": 0.8221122197576101, "kl": 0.0007367134094238281, "learning_rate": 4.994757065594279e-07, "loss": 0.0, "reward": 0.23186111450195312, "reward_std": 0.45240356028079987, "rewards/correct_code_reward_func": 0.0416666679084301, "rewards/len_reward_func": 0.19019444286823273, "step": 25 }, { "completion_length": 492.22918701171875, "epoch": 0.416, "grad_norm": 0.6644343017539492, "kl": 0.00038242340087890625, "learning_rate": 4.993656515203662e-07, "loss": 0.0, "reward": 0.3881392180919647, "reward_std": 0.45405760407447815, "rewards/correct_code_reward_func": 0.291666679084301, "rewards/len_reward_func": 0.09647253155708313, "step": 26 }, { "completion_length": 404.875, "epoch": 0.432, "grad_norm": 1.430989391876855, "kl": 0.0005321502685546875, "learning_rate": 4.992451335979955e-07, "loss": 0.0, "reward": 0.5776854753494263, "reward_std": 0.41993752121925354, "rewards/correct_code_reward_func": 0.4166666865348816, "rewards/len_reward_func": 0.16101885586977005, "step": 27 }, { "completion_length": 399.79168701171875, "epoch": 0.448, "grad_norm": 0.6573879108863315, "kl": 0.000904083251953125, "learning_rate": 4.991141578490066e-07, "loss": 0.0, "reward": 0.5069835484027863, "reward_std": 0.4736175239086151, "rewards/correct_code_reward_func": 0.3125000149011612, "rewards/len_reward_func": 0.19448353350162506, "step": 28 }, { "completion_length": 312.31251525878906, "epoch": 0.464, "grad_norm": 1.8201559702600916, "kl": 0.00235748291015625, "learning_rate": 4.989727297688796e-07, "loss": 0.0, "reward": 0.35250288248062134, "reward_std": 0.4461039751768112, "rewards/correct_code_reward_func": 0.1458333395421505, "rewards/len_reward_func": 0.20666955411434174, "step": 29 }, { "completion_length": 349.7083435058594, "epoch": 0.48, "grad_norm": 0.5301714114961311, "kl": 0.000766754150390625, "learning_rate": 4.988208552916535e-07, "loss": 0.0, "reward": 0.4679892510175705, "reward_std": 0.3862190693616867, "rewards/correct_code_reward_func": 0.2916666716337204, "rewards/len_reward_func": 0.1763225868344307, "step": 30 }, { "completion_length": 450.2916717529297, "epoch": 0.496, "grad_norm": 0.9601897979350579, "kl": 0.0031280517578125, "learning_rate": 4.986585407896771e-07, "loss": 0.0, "reward": 0.5194195210933685, "reward_std": 0.5953386127948761, "rewards/correct_code_reward_func": 0.3958333432674408, "rewards/len_reward_func": 0.12358617037534714, "step": 31 }, { "completion_length": 310.75001525878906, "epoch": 0.512, "grad_norm": 1.6987129262106306, "kl": 0.006591796875, "learning_rate": 4.984857930733419e-07, "loss": 0.0, "reward": 0.41354838013648987, "reward_std": 0.47000962495803833, "rewards/correct_code_reward_func": 0.2500000149011612, "rewards/len_reward_func": 0.16354837268590927, "step": 32 }, { "completion_length": 383.25001525878906, "epoch": 0.528, "grad_norm": 0.7545967354246785, "kl": 0.0016632080078125, "learning_rate": 4.98302619390796e-07, "loss": 0.0, "reward": 0.46299508213996887, "reward_std": 0.5469619035720825, "rewards/correct_code_reward_func": 0.3125, "rewards/len_reward_func": 0.15049506723880768, "step": 33 }, { "completion_length": 386.97918701171875, "epoch": 0.544, "grad_norm": 0.40693320383732745, "kl": 0.0008487701416015625, "learning_rate": 4.981090274276405e-07, "loss": 0.0, "reward": 0.36661335825920105, "reward_std": 0.599495530128479, "rewards/correct_code_reward_func": 0.20833333395421505, "rewards/len_reward_func": 0.15827999636530876, "step": 34 }, { "completion_length": 364.2083435058594, "epoch": 0.56, "grad_norm": 3.038529641996517, "kl": 0.013214111328125, "learning_rate": 4.979050253066063e-07, "loss": 0.0, "reward": 0.38925309479236603, "reward_std": 0.4519564062356949, "rewards/correct_code_reward_func": 0.1875000111758709, "rewards/len_reward_func": 0.20175307989120483, "step": 35 }, { "completion_length": 223.20833587646484, "epoch": 0.576, "grad_norm": 2.194175778485753, "kl": 0.00901031494140625, "learning_rate": 4.976906215872137e-07, "loss": 0.0, "reward": 0.5131015926599503, "reward_std": 0.42744188010692596, "rewards/correct_code_reward_func": 0.2708333432674408, "rewards/len_reward_func": 0.24226826429367065, "step": 36 }, { "completion_length": 309.8958435058594, "epoch": 0.592, "grad_norm": 1.0307041561865542, "kl": 0.0041351318359375, "learning_rate": 4.974658252654134e-07, "loss": 0.0, "reward": 0.5382832139730453, "reward_std": 0.49117420613765717, "rewards/correct_code_reward_func": 0.3750000149011612, "rewards/len_reward_func": 0.16328320652246475, "step": 37 }, { "completion_length": 243.45833587646484, "epoch": 0.608, "grad_norm": 0.845257170990527, "kl": 0.005828857421875, "learning_rate": 4.97230645773209e-07, "loss": 0.0, "reward": 0.3587033599615097, "reward_std": 0.40543846786022186, "rewards/correct_code_reward_func": 0.1041666716337204, "rewards/len_reward_func": 0.2545367032289505, "step": 38 }, { "completion_length": 347.81251525878906, "epoch": 0.624, "grad_norm": 0.6831356250050794, "kl": 0.0037384033203125, "learning_rate": 4.96985092978261e-07, "loss": 0.0, "reward": 0.42208923399448395, "reward_std": 0.48525381088256836, "rewards/correct_code_reward_func": 0.2916666716337204, "rewards/len_reward_func": 0.13042253628373146, "step": 39 }, { "completion_length": 164.64583587646484, "epoch": 0.64, "grad_norm": 0.7868888492846406, "kl": 0.00518798828125, "learning_rate": 4.967291771834726e-07, "loss": 0.0, "reward": 0.37410612404346466, "reward_std": 0.42655548453330994, "rewards/correct_code_reward_func": 0.1041666716337204, "rewards/len_reward_func": 0.26993944495916367, "step": 40 }, { "completion_length": 176.70833587646484, "epoch": 0.656, "grad_norm": 0.9834593603969678, "kl": 0.011749267578125, "learning_rate": 4.964629091265583e-07, "loss": 0.0, "reward": 0.493656724691391, "reward_std": 0.32837581634521484, "rewards/correct_code_reward_func": 0.2083333432674408, "rewards/len_reward_func": 0.285323366522789, "step": 41 }, { "completion_length": 193.7916717529297, "epoch": 0.672, "grad_norm": 0.4763159825214015, "kl": 0.0091552734375, "learning_rate": 4.961862999795923e-07, "loss": 0.0, "reward": 0.43107903003692627, "reward_std": 0.3805217146873474, "rewards/correct_code_reward_func": 0.10416666977107525, "rewards/len_reward_func": 0.3269123286008835, "step": 42 }, { "completion_length": 251.31250762939453, "epoch": 0.688, "grad_norm": 0.926543039135598, "kl": 0.007232666015625, "learning_rate": 4.958993613485405e-07, "loss": 0.0, "reward": 0.47894081473350525, "reward_std": 0.501429408788681, "rewards/correct_code_reward_func": 0.3125, "rewards/len_reward_func": 0.16644081473350525, "step": 43 }, { "completion_length": 277.06251525878906, "epoch": 0.704, "grad_norm": 0.4442878657111374, "kl": 0.0048065185546875, "learning_rate": 4.956021052727731e-07, "loss": 0.0, "reward": 0.4777670353651047, "reward_std": 0.4489281326532364, "rewards/correct_code_reward_func": 0.2500000149011612, "rewards/len_reward_func": 0.22776702046394348, "step": 44 }, { "completion_length": 236.25, "epoch": 0.72, "grad_norm": 0.8268091601529216, "kl": 0.0167388916015625, "learning_rate": 4.952945442245597e-07, "loss": 0.0, "reward": 0.3185321241617203, "reward_std": 0.3292020410299301, "rewards/correct_code_reward_func": 0.06250000186264515, "rewards/len_reward_func": 0.2560321241617203, "step": 45 }, { "completion_length": 255.20834350585938, "epoch": 0.736, "grad_norm": 0.5352983780133181, "kl": 0.0109405517578125, "learning_rate": 4.949766911085461e-07, "loss": 0.0, "reward": 0.40212514996528625, "reward_std": 0.4124213755130768, "rewards/correct_code_reward_func": 0.16666667722165585, "rewards/len_reward_func": 0.23545847833156586, "step": 46 }, { "completion_length": 206.18750762939453, "epoch": 0.752, "grad_norm": 1.0989971408848502, "kl": 0.012786865234375, "learning_rate": 4.946485592612122e-07, "loss": 0.0, "reward": 0.5097324252128601, "reward_std": 0.44956831634044647, "rewards/correct_code_reward_func": 0.2708333432674408, "rewards/len_reward_func": 0.23889903724193573, "step": 47 }, { "completion_length": 129.08333587646484, "epoch": 0.768, "grad_norm": 1.030854950339047, "kl": 0.03948974609375, "learning_rate": 4.943101624503132e-07, "loss": 0.0, "reward": 0.616540938615799, "reward_std": 0.528562068939209, "rewards/correct_code_reward_func": 0.375, "rewards/len_reward_func": 0.24154090136289597, "step": 48 }, { "completion_length": 126.66667175292969, "epoch": 0.784, "grad_norm": 1.2775292444091597, "kl": 0.0452880859375, "learning_rate": 4.939615148743017e-07, "loss": 0.0, "reward": 0.5280424952507019, "reward_std": 0.5158527791500092, "rewards/correct_code_reward_func": 0.2708333395421505, "rewards/len_reward_func": 0.2572091445326805, "step": 49 }, { "completion_length": 128.93750381469727, "epoch": 0.8, "grad_norm": 0.9671908865335344, "kl": 0.025238037109375, "learning_rate": 4.936026311617316e-07, "loss": 0.0, "reward": 0.4106781929731369, "reward_std": 0.4047301709651947, "rewards/correct_code_reward_func": 0.12500000558793545, "rewards/len_reward_func": 0.2856782227754593, "step": 50 }, { "completion_length": 173.31250762939453, "epoch": 0.816, "grad_norm": 0.7968037935478003, "kl": 0.0194091796875, "learning_rate": 4.932335263706445e-07, "loss": 0.0, "reward": 0.6258751451969147, "reward_std": 0.4580596834421158, "rewards/correct_code_reward_func": 0.458333358168602, "rewards/len_reward_func": 0.16754178702831268, "step": 51 }, { "completion_length": 141.7083396911621, "epoch": 0.832, "grad_norm": 1.006183031163246, "kl": 0.03253173828125, "learning_rate": 4.928542159879385e-07, "loss": 0.0, "reward": 0.6714096814393997, "reward_std": 0.5154012739658356, "rewards/correct_code_reward_func": 0.4583333432674408, "rewards/len_reward_func": 0.21307633817195892, "step": 52 }, { "completion_length": 91.47916793823242, "epoch": 0.848, "grad_norm": 1.4433890659727795, "kl": 0.0452880859375, "learning_rate": 4.924647159287175e-07, "loss": 0.0, "reward": 0.3492863178253174, "reward_std": 0.31160949170589447, "rewards/correct_code_reward_func": 0.02083333395421505, "rewards/len_reward_func": 0.3284529894590378, "step": 53 }, { "completion_length": 89.06250381469727, "epoch": 0.864, "grad_norm": 1.244678229537496, "kl": 0.0367431640625, "learning_rate": 4.920650425356239e-07, "loss": 0.0, "reward": 0.4604046642780304, "reward_std": 0.3904419094324112, "rewards/correct_code_reward_func": 0.10416666977107525, "rewards/len_reward_func": 0.3562380075454712, "step": 54 }, { "completion_length": 89.93750381469727, "epoch": 0.88, "grad_norm": 0.9722090267696216, "kl": 0.0489501953125, "learning_rate": 4.916552125781528e-07, "loss": 0.0, "reward": 0.5890590101480484, "reward_std": 0.40695688128471375, "rewards/correct_code_reward_func": 0.27083333395421505, "rewards/len_reward_func": 0.3182256817817688, "step": 55 }, { "completion_length": 129.1666717529297, "epoch": 0.896, "grad_norm": 0.8934927590767672, "kl": 0.031707763671875, "learning_rate": 4.912352432519484e-07, "loss": 0.0, "reward": 0.43804308772087097, "reward_std": 0.3997139036655426, "rewards/correct_code_reward_func": 0.1458333395421505, "rewards/len_reward_func": 0.2922097444534302, "step": 56 }, { "completion_length": 85.87500381469727, "epoch": 0.912, "grad_norm": 1.059356454998566, "kl": 0.0428466796875, "learning_rate": 4.908051521780824e-07, "loss": 0.0, "reward": 0.46079379320144653, "reward_std": 0.5179382562637329, "rewards/correct_code_reward_func": 0.1875000111758709, "rewards/len_reward_func": 0.2732938081026077, "step": 57 }, { "completion_length": 56.1875, "epoch": 0.928, "grad_norm": 1.5024432197845348, "kl": 0.08740234375, "learning_rate": 4.90364957402315e-07, "loss": 0.0001, "reward": 0.4640047252178192, "reward_std": 0.4288184642791748, "rewards/correct_code_reward_func": 0.1875000074505806, "rewards/len_reward_func": 0.2765047252178192, "step": 58 }, { "completion_length": 73.39583396911621, "epoch": 0.944, "grad_norm": 1.7650779542824517, "kl": 0.10009765625, "learning_rate": 4.899146773943373e-07, "loss": 0.0001, "reward": 0.5490352362394333, "reward_std": 0.31501778960227966, "rewards/correct_code_reward_func": 0.1875, "rewards/len_reward_func": 0.3615352660417557, "step": 59 }, { "completion_length": 73.18750190734863, "epoch": 0.96, "grad_norm": 0.7172587724443913, "kl": 0.0584716796875, "learning_rate": 4.894543310469967e-07, "loss": 0.0001, "reward": 0.4920351505279541, "reward_std": 0.4270896017551422, "rewards/correct_code_reward_func": 0.2291666716337204, "rewards/len_reward_func": 0.2628684788942337, "step": 60 }, { "completion_length": 101.14583969116211, "epoch": 0.976, "grad_norm": 0.9692991425732275, "kl": 0.077880859375, "learning_rate": 4.88983937675504e-07, "loss": 0.0001, "reward": 0.5707934498786926, "reward_std": 0.4804637283086777, "rewards/correct_code_reward_func": 0.25000000558793545, "rewards/len_reward_func": 0.3207934498786926, "step": 61 }, { "completion_length": 90.43750381469727, "epoch": 0.992, "grad_norm": 0.9307526215492377, "kl": 0.0550537109375, "learning_rate": 4.885035170166228e-07, "loss": 0.0001, "reward": 0.4401155561208725, "reward_std": 0.2671549841761589, "rewards/correct_code_reward_func": 0.1041666716337204, "rewards/len_reward_func": 0.3359488919377327, "step": 62 }, { "completion_length": 79.08333587646484, "epoch": 1.0, "grad_norm": 0.9307526215492377, "kl": 0.1474609375, "learning_rate": 4.880130892278419e-07, "loss": 0.0001, "reward": 0.4844928979873657, "reward_std": 0.3104479908943176, "rewards/correct_code_reward_func": 0.0833333358168602, "rewards/len_reward_func": 0.4011596143245697, "step": 63 }, { "completion_length": 65.97916793823242, "epoch": 1.016, "grad_norm": 1.2251318507150297, "kl": 0.09423828125, "learning_rate": 4.875126748865289e-07, "loss": 0.0001, "reward": 0.5488586723804474, "reward_std": 0.3761359751224518, "rewards/correct_code_reward_func": 0.2083333432674408, "rewards/len_reward_func": 0.3405253440141678, "step": 64 }, { "completion_length": 80.97916793823242, "epoch": 1.032, "grad_norm": 1.7067808615741822, "kl": 0.0869140625, "learning_rate": 4.870022949890676e-07, "loss": 0.0001, "reward": 0.47587963938713074, "reward_std": 0.40881581604480743, "rewards/correct_code_reward_func": 0.125, "rewards/len_reward_func": 0.3508796691894531, "step": 65 }, { "completion_length": 68.125, "epoch": 1.048, "grad_norm": 0.9894209736736024, "kl": 0.08544921875, "learning_rate": 4.864819709499761e-07, "loss": 0.0001, "reward": 0.5112280547618866, "reward_std": 0.39189808815717697, "rewards/correct_code_reward_func": 0.1666666679084301, "rewards/len_reward_func": 0.3445614129304886, "step": 66 }, { "completion_length": 82.16666793823242, "epoch": 1.064, "grad_norm": 1.0146554350650319, "kl": 0.13720703125, "learning_rate": 4.85951724601009e-07, "loss": 0.0001, "reward": 0.5718176364898682, "reward_std": 0.41909658908843994, "rewards/correct_code_reward_func": 0.2500000074505806, "rewards/len_reward_func": 0.32181763648986816, "step": 67 }, { "completion_length": 67.18750190734863, "epoch": 1.08, "grad_norm": 1.209065945657589, "kl": 0.13037109375, "learning_rate": 4.854115781902414e-07, "loss": 0.0001, "reward": 0.38857993483543396, "reward_std": 0.37996064126491547, "rewards/correct_code_reward_func": 0.0625, "rewards/len_reward_func": 0.32607994973659515, "step": 68 }, { "completion_length": 63.9375, "epoch": 1.096, "grad_norm": 1.0015586737181452, "kl": 0.121337890625, "learning_rate": 4.848615543811344e-07, "loss": 0.0001, "reward": 0.6716190874576569, "reward_std": 0.3675233870744705, "rewards/correct_code_reward_func": 0.2916666716337204, "rewards/len_reward_func": 0.37995241582393646, "step": 69 }, { "completion_length": 45.00000190734863, "epoch": 1.112, "grad_norm": 1.3746237501719711, "kl": 0.166015625, "learning_rate": 4.843016762515859e-07, "loss": 0.0002, "reward": 0.38038890063762665, "reward_std": 0.32317543029785156, "rewards/correct_code_reward_func": 0.0416666679084301, "rewards/len_reward_func": 0.33872224390506744, "step": 70 }, { "completion_length": 57.395835876464844, "epoch": 1.1280000000000001, "grad_norm": 2.2658811424218266, "kl": 0.086669921875, "learning_rate": 4.837319672929606e-07, "loss": 0.0001, "reward": 0.4675743877887726, "reward_std": 0.29566246271133423, "rewards/correct_code_reward_func": 0.1041666716337204, "rewards/len_reward_func": 0.363407701253891, "step": 71 }, { "completion_length": 39.66666793823242, "epoch": 1.144, "grad_norm": 0.9853509970570241, "kl": 0.1171875, "learning_rate": 4.831524514091056e-07, "loss": 0.0001, "reward": 0.5477184951305389, "reward_std": 0.4072943925857544, "rewards/correct_code_reward_func": 0.1875000074505806, "rewards/len_reward_func": 0.36021851003170013, "step": 72 }, { "completion_length": 40.10416793823242, "epoch": 1.16, "grad_norm": 2.4194323860380593, "kl": 0.18359375, "learning_rate": 4.825631529153466e-07, "loss": 0.0002, "reward": 0.49018459022045135, "reward_std": 0.25917236506938934, "rewards/correct_code_reward_func": 0.16666667722165585, "rewards/len_reward_func": 0.32351788878440857, "step": 73 }, { "completion_length": 29.562500953674316, "epoch": 1.176, "grad_norm": 1.687397775503762, "kl": 0.40771484375, "learning_rate": 4.81964096537468e-07, "loss": 0.0004, "reward": 0.43547162413597107, "reward_std": 0.2647310718894005, "rewards/correct_code_reward_func": 0.0416666679084301, "rewards/len_reward_func": 0.39380495250225067, "step": 74 }, { "completion_length": 40.16666793823242, "epoch": 1.192, "grad_norm": 1.1816855995040336, "kl": 0.146484375, "learning_rate": 4.81355307410676e-07, "loss": 0.0001, "reward": 0.5548797100782394, "reward_std": 0.3682420402765274, "rewards/correct_code_reward_func": 0.2083333432674408, "rewards/len_reward_func": 0.3465464115142822, "step": 75 }, { "completion_length": 93.93750190734863, "epoch": 1.208, "grad_norm": 1.2669964512881071, "kl": 0.2099609375, "learning_rate": 4.80736811078543e-07, "loss": 0.0002, "reward": 0.4997504949569702, "reward_std": 0.3275124281644821, "rewards/correct_code_reward_func": 0.12500000558793545, "rewards/len_reward_func": 0.3747505098581314, "step": 76 }, { "completion_length": 36.79166793823242, "epoch": 1.224, "grad_norm": 1.1317327261317542, "kl": 0.153564453125, "learning_rate": 4.80108633491936e-07, "loss": 0.0002, "reward": 0.43036508560180664, "reward_std": 0.27128875255584717, "rewards/correct_code_reward_func": 0.1250000037252903, "rewards/len_reward_func": 0.30536508560180664, "step": 77 }, { "completion_length": 53.70833396911621, "epoch": 1.24, "grad_norm": 1.5731589655579585, "kl": 0.19970703125, "learning_rate": 4.794708010079288e-07, "loss": 0.0002, "reward": 0.5270055830478668, "reward_std": 0.35258112847805023, "rewards/correct_code_reward_func": 0.1666666716337204, "rewards/len_reward_func": 0.36033889651298523, "step": 78 }, { "completion_length": 44.37500190734863, "epoch": 1.256, "grad_norm": 1.0496022486728145, "kl": 0.220703125, "learning_rate": 4.788233403886949e-07, "loss": 0.0002, "reward": 0.48745329678058624, "reward_std": 0.1396162062883377, "rewards/correct_code_reward_func": 0.06250000186264515, "rewards/len_reward_func": 0.42495329678058624, "step": 79 }, { "completion_length": 27.291666984558105, "epoch": 1.272, "grad_norm": 1.0530921922250815, "kl": 0.216796875, "learning_rate": 4.78166278800385e-07, "loss": 0.0002, "reward": 0.579190120100975, "reward_std": 0.34055350720882416, "rewards/correct_code_reward_func": 0.2083333395421505, "rewards/len_reward_func": 0.37085679173469543, "step": 80 }, { "completion_length": 32.50000190734863, "epoch": 1.288, "grad_norm": 1.18045709904702, "kl": 0.19189453125, "learning_rate": 4.774996438119876e-07, "loss": 0.0002, "reward": 0.6207249760627747, "reward_std": 0.377775639295578, "rewards/correct_code_reward_func": 0.1875, "rewards/len_reward_func": 0.43322494626045227, "step": 81 }, { "completion_length": 23.979166984558105, "epoch": 1.304, "grad_norm": 1.6753165605296896, "kl": 0.21875, "learning_rate": 4.7682346339417157e-07, "loss": 0.0002, "reward": 0.5133648067712784, "reward_std": 0.12423056736588478, "rewards/correct_code_reward_func": 0.0416666679084301, "rewards/len_reward_func": 0.471698135137558, "step": 82 }, { "completion_length": 41.10416793823242, "epoch": 1.32, "grad_norm": 1.5309365229407714, "kl": 0.23876953125, "learning_rate": 4.7613776591811295e-07, "loss": 0.0002, "reward": 0.5157457143068314, "reward_std": 0.17486733943223953, "rewards/correct_code_reward_func": 0.1041666716337204, "rewards/len_reward_func": 0.41157902777194977, "step": 83 }, { "completion_length": 24.70833396911621, "epoch": 1.336, "grad_norm": 1.2440691306439626, "kl": 0.388671875, "learning_rate": 4.754425801543046e-07, "loss": 0.0004, "reward": 0.46331432461738586, "reward_std": 0.17115781363099813, "rewards/correct_code_reward_func": 0.06250000186264515, "rewards/len_reward_func": 0.4008142799139023, "step": 84 }, { "completion_length": 40.08333396911621, "epoch": 1.3519999999999999, "grad_norm": 0.9581765714522964, "kl": 0.18310546875, "learning_rate": 4.747379352713488e-07, "loss": 0.0002, "reward": 0.5128517299890518, "reward_std": 0.13994156941771507, "rewards/correct_code_reward_func": 0.1041666716337204, "rewards/len_reward_func": 0.4086850583553314, "step": 85 }, { "completion_length": 28.6875, "epoch": 1.3679999999999999, "grad_norm": 1.5448289083630962, "kl": 0.203125, "learning_rate": 4.7402386083473364e-07, "loss": 0.0002, "reward": 0.6120259165763855, "reward_std": 0.24722883105278015, "rewards/correct_code_reward_func": 0.1875, "rewards/len_reward_func": 0.4245258867740631, "step": 86 }, { "completion_length": 44.75, "epoch": 1.384, "grad_norm": 1.3501042765686608, "kl": 0.1884765625, "learning_rate": 4.7330038680559224e-07, "loss": 0.0002, "reward": 0.4797859787940979, "reward_std": 0.336033895611763, "rewards/correct_code_reward_func": 0.10416666977107525, "rewards/len_reward_func": 0.3756193071603775, "step": 87 }, { "completion_length": 21.354166984558105, "epoch": 1.4, "grad_norm": 1.4869496350604456, "kl": 0.30322265625, "learning_rate": 4.72567543539446e-07, "loss": 0.0003, "reward": 0.4333198815584183, "reward_std": 0.28416211903095245, "rewards/correct_code_reward_func": 0.0416666679084301, "rewards/len_reward_func": 0.3916532099246979, "step": 88 }, { "completion_length": 27.58333396911621, "epoch": 1.416, "grad_norm": 1.6228999757156357, "kl": 0.3466796875, "learning_rate": 4.718253617849305e-07, "loss": 0.0003, "reward": 0.5190460979938507, "reward_std": 0.24678421020507812, "rewards/correct_code_reward_func": 0.1041666716337204, "rewards/len_reward_func": 0.4148794263601303, "step": 89 }, { "completion_length": 30.895834922790527, "epoch": 1.432, "grad_norm": 1.3213740664946243, "kl": 0.2158203125, "learning_rate": 4.7107387268250586e-07, "loss": 0.0002, "reward": 0.5004410147666931, "reward_std": 0.31192412972450256, "rewards/correct_code_reward_func": 0.1458333395421505, "rewards/len_reward_func": 0.3546076714992523, "step": 90 }, { "completion_length": 37.60416793823242, "epoch": 1.448, "grad_norm": 1.579120317520106, "kl": 0.25048828125, "learning_rate": 4.703131077631497e-07, "loss": 0.0002, "reward": 0.6014063358306885, "reward_std": 0.07078037410974503, "rewards/correct_code_reward_func": 0.1458333432674408, "rewards/len_reward_func": 0.4555730074644089, "step": 91 }, { "completion_length": 24.229166984558105, "epoch": 1.464, "grad_norm": 0.7584124455777198, "kl": 0.248046875, "learning_rate": 4.6954309894703426e-07, "loss": 0.0002, "reward": 0.4590907543897629, "reward_std": 0.16932503879070282, "rewards/correct_code_reward_func": 0.0416666679084301, "rewards/len_reward_func": 0.4174240827560425, "step": 92 }, { "completion_length": 30.64583396911621, "epoch": 1.48, "grad_norm": 0.0060252039125188945, "kl": 0.17919921875, "learning_rate": 4.6876387854218744e-07, "loss": 0.0001, "reward": 0.46853742003440857, "reward_std": 0.08898964250827746, "rewards/correct_code_reward_func": 0.06250000186264515, "rewards/len_reward_func": 0.40603742003440857, "step": 93 }, { "completion_length": 36.25000190734863, "epoch": 1.496, "grad_norm": 0.8958155592124936, "kl": 0.19775390625, "learning_rate": 4.6797547924313673e-07, "loss": 0.0002, "reward": 0.44910070300102234, "reward_std": 0.24745193123817444, "rewards/correct_code_reward_func": 0.02083333395421505, "rewards/len_reward_func": 0.42826738953590393, "step": 94 }, { "completion_length": 40.35416793823242, "epoch": 1.512, "grad_norm": 1.5558035278415578, "kl": 0.21337890625, "learning_rate": 4.6717793412953776e-07, "loss": 0.0002, "reward": 0.44604572653770447, "reward_std": 0.30530666559934616, "rewards/correct_code_reward_func": 0.0416666679084301, "rewards/len_reward_func": 0.40437906980514526, "step": 95 }, { "completion_length": 35.291666984558105, "epoch": 1.528, "grad_norm": 1.4321261431714865, "kl": 0.39892578125, "learning_rate": 4.6637127666478617e-07, "loss": 0.0004, "reward": 0.46982091665267944, "reward_std": 0.08393960446119308, "rewards/correct_code_reward_func": 0.02083333395421505, "rewards/len_reward_func": 0.44898758828639984, "step": 96 }, { "completion_length": 24.354166984558105, "epoch": 1.544, "grad_norm": 0.929795596193928, "kl": 0.29443359375, "learning_rate": 4.6555554069461346e-07, "loss": 0.0003, "reward": 0.46133650839328766, "reward_std": 0.2364010475575924, "rewards/correct_code_reward_func": 0.0625, "rewards/len_reward_func": 0.39883650839328766, "step": 97 }, { "completion_length": 28.95833396911621, "epoch": 1.56, "grad_norm": 0.6658422780899015, "kl": 0.31982421875, "learning_rate": 4.647307604456674e-07, "loss": 0.0003, "reward": 0.5673407018184662, "reward_std": 0.12253111600875854, "rewards/correct_code_reward_func": 0.1041666716337204, "rewards/len_reward_func": 0.4631740301847458, "step": 98 }, { "completion_length": 15.770833969116211, "epoch": 1.576, "grad_norm": 2.493104620020484, "kl": 0.4345703125, "learning_rate": 4.6389697052407526e-07, "loss": 0.0004, "reward": 0.591472789645195, "reward_std": 0.3403037488460541, "rewards/correct_code_reward_func": 0.1666666716337204, "rewards/len_reward_func": 0.4248061031103134, "step": 99 }, { "completion_length": 18.000000953674316, "epoch": 1.592, "grad_norm": 1.652920479976726, "kl": 0.48828125, "learning_rate": 4.630542059139923e-07, "loss": 0.0005, "reward": 0.4733283668756485, "reward_std": 0.07313242554664612, "rewards/correct_code_reward_func": 0.0, "rewards/len_reward_func": 0.4733283668756485, "step": 100 }, { "completion_length": 24.104166984558105, "epoch": 1.608, "grad_norm": 0.6637104853285745, "kl": 0.3203125, "learning_rate": 4.622025019761336e-07, "loss": 0.0003, "reward": 0.6387104988098145, "reward_std": 0.21521971747279167, "rewards/correct_code_reward_func": 0.2083333432674408, "rewards/len_reward_func": 0.43037715554237366, "step": 101 }, { "completion_length": 23.687500953674316, "epoch": 1.624, "grad_norm": 1.096819799191345, "kl": 0.3330078125, "learning_rate": 4.613418944462906e-07, "loss": 0.0003, "reward": 0.5115740746259689, "reward_std": 0.20615240186452866, "rewards/correct_code_reward_func": 0.0833333358168602, "rewards/len_reward_func": 0.42824074625968933, "step": 102 }, { "completion_length": 18.291667938232422, "epoch": 1.6400000000000001, "grad_norm": 0.7425841429065823, "kl": 0.4208984375, "learning_rate": 4.6047241943383173e-07, "loss": 0.0004, "reward": 0.5208333432674408, "reward_std": 0.0589255653321743, "rewards/correct_code_reward_func": 0.06250000186264515, "rewards/len_reward_func": 0.4583333432674408, "step": 103 }, { "completion_length": 20.1875, "epoch": 1.6560000000000001, "grad_norm": 1.0672134971206753, "kl": 0.2548828125, "learning_rate": 4.5959411342018704e-07, "loss": 0.0003, "reward": 0.5686589479446411, "reward_std": 0.1569155752658844, "rewards/correct_code_reward_func": 0.1458333432674408, "rewards/len_reward_func": 0.42282557487487793, "step": 104 }, { "completion_length": 18.89583396911621, "epoch": 1.6720000000000002, "grad_norm": 1.9534809364444377, "kl": 0.4580078125, "learning_rate": 4.5870701325731773e-07, "loss": 0.0005, "reward": 0.5205335766077042, "reward_std": 0.14886049553751945, "rewards/correct_code_reward_func": 0.0416666679084301, "rewards/len_reward_func": 0.47886690497398376, "step": 105 }, { "completion_length": 30.854167938232422, "epoch": 1.688, "grad_norm": 1.886180541460537, "kl": 0.2861328125, "learning_rate": 4.578111561661702e-07, "loss": 0.0003, "reward": 0.5625, "reward_std": 0.1753452718257904, "rewards/correct_code_reward_func": 0.0833333358168602, "rewards/len_reward_func": 0.4791666716337204, "step": 106 }, { "completion_length": 22.979166984558105, "epoch": 1.704, "grad_norm": 1.3514702193989006, "kl": 0.3154296875, "learning_rate": 4.569065797351135e-07, "loss": 0.0003, "reward": 0.614065021276474, "reward_std": 0.30389876663684845, "rewards/correct_code_reward_func": 0.2083333432674408, "rewards/len_reward_func": 0.4057316929101944, "step": 107 }, { "completion_length": 22.14583396911621, "epoch": 1.72, "grad_norm": 1.3204173636971241, "kl": 0.359375, "learning_rate": 4.559933219183631e-07, "loss": 0.0004, "reward": 0.5397135615348816, "reward_std": 0.11889010295271873, "rewards/correct_code_reward_func": 0.0416666679084301, "rewards/len_reward_func": 0.498046875, "step": 108 }, { "completion_length": 16.000000476837158, "epoch": 1.736, "grad_norm": 1.358034882182442, "kl": 0.4677734375, "learning_rate": 4.550714210343879e-07, "loss": 0.0005, "reward": 0.7275510132312775, "reward_std": 0.3622310161590576, "rewards/correct_code_reward_func": 0.3333333432674408, "rewards/len_reward_func": 0.39421766996383667, "step": 109 }, { "completion_length": 23.14583396911621, "epoch": 1.752, "grad_norm": 1.916868694407283, "kl": 0.3154296875, "learning_rate": 4.541409157643027e-07, "loss": 0.0003, "reward": 0.4378484785556793, "reward_std": 0.12258240953087807, "rewards/correct_code_reward_func": 0.0, "rewards/len_reward_func": 0.4378484785556793, "step": 110 }, { "completion_length": 13.75, "epoch": 1.768, "grad_norm": 1.6380973978730606, "kl": 0.513671875, "learning_rate": 4.5320184515024493e-07, "loss": 0.0005, "reward": 0.6193452775478363, "reward_std": 0.22009535133838654, "rewards/correct_code_reward_func": 0.125, "rewards/len_reward_func": 0.4943452626466751, "step": 111 }, { "completion_length": 39.854166984558105, "epoch": 1.784, "grad_norm": 3.8105256272979995, "kl": 0.2919921875, "learning_rate": 4.5225424859373684e-07, "loss": 0.0003, "reward": 0.523853987455368, "reward_std": 0.16823304444551468, "rewards/correct_code_reward_func": 0.06250000186264515, "rewards/len_reward_func": 0.46135397255420685, "step": 112 }, { "completion_length": 15.291666984558105, "epoch": 1.8, "grad_norm": 1.1650221360198958, "kl": 0.541015625, "learning_rate": 4.51298165854032e-07, "loss": 0.0005, "reward": 0.5442352294921875, "reward_std": 0.06456775963306427, "rewards/correct_code_reward_func": 0.0833333358168602, "rewards/len_reward_func": 0.4609019011259079, "step": 113 }, { "completion_length": 20.625, "epoch": 1.8159999999999998, "grad_norm": 0.020013783247992148, "kl": 0.3828125, "learning_rate": 4.503336370464475e-07, "loss": 0.0004, "reward": 0.5208333432674408, "reward_std": 0.0589255653321743, "rewards/correct_code_reward_func": 0.0416666679084301, "rewards/len_reward_func": 0.4791666716337204, "step": 114 }, { "completion_length": 18.437500953674316, "epoch": 1.8319999999999999, "grad_norm": 0.8327328617247051, "kl": 0.396484375, "learning_rate": 4.4936070264068016e-07, "loss": 0.0004, "reward": 0.6041666716337204, "reward_std": 0.2620653882622719, "rewards/correct_code_reward_func": 0.1875, "rewards/len_reward_func": 0.4166666716337204, "step": 115 }, { "completion_length": 23.64583396911621, "epoch": 1.8479999999999999, "grad_norm": 1.683115986414425, "kl": 0.34375, "learning_rate": 4.4837940345910917e-07, "loss": 0.0003, "reward": 0.539160430431366, "reward_std": 0.08424048312008381, "rewards/correct_code_reward_func": 0.0833333358168602, "rewards/len_reward_func": 0.45582708716392517, "step": 116 }, { "completion_length": 17.479166984558105, "epoch": 1.8639999999999999, "grad_norm": 0.9874831743158241, "kl": 0.3681640625, "learning_rate": 4.473897806750828e-07, "loss": 0.0003, "reward": 0.47979801893234253, "reward_std": 0.1770581193268299, "rewards/correct_code_reward_func": 0.0833333358168602, "rewards/len_reward_func": 0.39646467566490173, "step": 117 }, { "completion_length": 17.791666984558105, "epoch": 1.88, "grad_norm": 1.4038513263647607, "kl": 0.3759765625, "learning_rate": 4.4639187581119116e-07, "loss": 0.0004, "reward": 0.6913580298423767, "reward_std": 0.26719751954078674, "rewards/correct_code_reward_func": 0.2500000074505806, "rewards/len_reward_func": 0.4413580149412155, "step": 118 }, { "completion_length": 19.08333396911621, "epoch": 1.896, "grad_norm": 2.278605940745065, "kl": 0.3603515625, "learning_rate": 4.453857307375236e-07, "loss": 0.0004, "reward": 0.5320361256599426, "reward_std": 0.10937795042991638, "rewards/correct_code_reward_func": 0.0416666679084301, "rewards/len_reward_func": 0.4903694689273834, "step": 119 }, { "completion_length": 20.166666984558105, "epoch": 1.912, "grad_norm": 0.010837700850525291, "kl": 0.326171875, "learning_rate": 4.443713876699123e-07, "loss": 0.0003, "reward": 0.6041666865348816, "reward_std": 0.08625819534063339, "rewards/correct_code_reward_func": 0.1041666716337204, "rewards/len_reward_func": 0.5, "step": 120 }, { "completion_length": 18.916666984558105, "epoch": 1.928, "grad_norm": 4.137929355426566, "kl": 0.3173828125, "learning_rate": 4.433488891681609e-07, "loss": 0.0003, "reward": 0.7551863789558411, "reward_std": 0.3335232138633728, "rewards/correct_code_reward_func": 0.3125000149011612, "rewards/len_reward_func": 0.44268636405467987, "step": 121 }, { "completion_length": 30.500001907348633, "epoch": 1.944, "grad_norm": 1.9378965639286245, "kl": 0.2412109375, "learning_rate": 4.423182781342588e-07, "loss": 0.0002, "reward": 0.5831037163734436, "reward_std": 0.3190010190010071, "rewards/correct_code_reward_func": 0.1458333358168602, "rewards/len_reward_func": 0.4372703731060028, "step": 122 }, { "completion_length": 31.833334922790527, "epoch": 1.96, "grad_norm": 1.4092055403920425, "kl": 0.294921875, "learning_rate": 4.412795978105807e-07, "loss": 0.0003, "reward": 0.6458333432674408, "reward_std": 0.28126100823283195, "rewards/correct_code_reward_func": 0.14583333395421505, "rewards/len_reward_func": 0.5, "step": 123 }, { "completion_length": 20.75, "epoch": 1.976, "grad_norm": 1.8731345483243946, "kl": 0.3056640625, "learning_rate": 4.402328917780728e-07, "loss": 0.0003, "reward": 0.6243590116500854, "reward_std": 0.19208041578531265, "rewards/correct_code_reward_func": 0.1458333395421505, "rewards/len_reward_func": 0.47852563858032227, "step": 124 }, { "completion_length": 34.43750190734863, "epoch": 1.992, "grad_norm": 2.251696553247793, "kl": 0.17919921875, "learning_rate": 4.391782039544238e-07, "loss": 0.0002, "reward": 0.5416666716337204, "reward_std": 0.1451837606728077, "rewards/correct_code_reward_func": 0.0625, "rewards/len_reward_func": 0.4791666716337204, "step": 125 }, { "completion_length": 20.45833396911621, "epoch": 2.0, "grad_norm": 0.7665869817926465, "kl": 0.16015625, "learning_rate": 4.381155785922225e-07, "loss": 0.0001, "reward": 0.625, "reward_std": 0.17251639068126678, "rewards/correct_code_reward_func": 0.125, "rewards/len_reward_func": 0.5, "step": 126 }, { "completion_length": 22.64583396911621, "epoch": 2.016, "grad_norm": 1.4642947533469586, "kl": 0.23876953125, "learning_rate": 4.37045060277101e-07, "loss": 0.0002, "reward": 0.7051360011100769, "reward_std": 0.33548664301633835, "rewards/correct_code_reward_func": 0.3125, "rewards/len_reward_func": 0.3926360011100769, "step": 127 }, { "completion_length": 25.791667938232422, "epoch": 2.032, "grad_norm": 2.2744596355142987, "kl": 0.23046875, "learning_rate": 4.3596669392586363e-07, "loss": 0.0002, "reward": 0.45531921088695526, "reward_std": 0.2537742704153061, "rewards/correct_code_reward_func": 0.0416666679084301, "rewards/len_reward_func": 0.41365256905555725, "step": 128 }, { "completion_length": 30.979167938232422, "epoch": 2.048, "grad_norm": 1.152083526235397, "kl": 0.23193359375, "learning_rate": 4.348805247846027e-07, "loss": 0.0002, "reward": 0.6339346170425415, "reward_std": 0.23110489547252655, "rewards/correct_code_reward_func": 0.20833333395421505, "rewards/len_reward_func": 0.4256012886762619, "step": 129 }, { "completion_length": 25.08333396911621, "epoch": 2.064, "grad_norm": 2.3077859885366028, "kl": 0.26806640625, "learning_rate": 4.337865984268001e-07, "loss": 0.0003, "reward": 0.6608805358409882, "reward_std": 0.2853649668395519, "rewards/correct_code_reward_func": 0.18750000558793545, "rewards/len_reward_func": 0.47338053584098816, "step": 130 }, { "completion_length": 29.854167938232422, "epoch": 2.08, "grad_norm": 1.3425058048706657, "kl": 0.197265625, "learning_rate": 4.326849607514148e-07, "loss": 0.0002, "reward": 0.5538955330848694, "reward_std": 0.28982797265052795, "rewards/correct_code_reward_func": 0.1041666679084301, "rewards/len_reward_func": 0.4497288465499878, "step": 131 }, { "completion_length": 30.70833396911621, "epoch": 2.096, "grad_norm": 2.173464158764514, "kl": 0.2490234375, "learning_rate": 4.3157565798095746e-07, "loss": 0.0002, "reward": 0.6434731781482697, "reward_std": 0.31796523183584213, "rewards/correct_code_reward_func": 0.2083333432674408, "rewards/len_reward_func": 0.43513981997966766, "step": 132 }, { "completion_length": 29.20833396911621, "epoch": 2.112, "grad_norm": 1.5233613223232554, "kl": 0.27880859375, "learning_rate": 4.304587366595505e-07, "loss": 0.0003, "reward": 0.6140289306640625, "reward_std": 0.17255835235118866, "rewards/correct_code_reward_func": 0.1666666716337204, "rewards/len_reward_func": 0.4473622739315033, "step": 133 }, { "completion_length": 34.08333396911621, "epoch": 2.128, "grad_norm": 2.653484217985997, "kl": 0.18212890625, "learning_rate": 4.293342436509756e-07, "loss": 0.0002, "reward": 0.6950471103191376, "reward_std": 0.298052042722702, "rewards/correct_code_reward_func": 0.2500000074505806, "rewards/len_reward_func": 0.4450470805168152, "step": 134 }, { "completion_length": 27.6875, "epoch": 2.144, "grad_norm": 1.7492354771411964, "kl": 0.216796875, "learning_rate": 4.282022261367073e-07, "loss": 0.0002, "reward": 0.48720741271972656, "reward_std": 0.2177068144083023, "rewards/correct_code_reward_func": 0.0833333358168602, "rewards/len_reward_func": 0.40387406945228577, "step": 135 }, { "completion_length": 32.25000190734863, "epoch": 2.16, "grad_norm": 1.1359022223853992, "kl": 0.21728515625, "learning_rate": 4.2706273161393326e-07, "loss": 0.0002, "reward": 0.6075980365276337, "reward_std": 0.19440394639968872, "rewards/correct_code_reward_func": 0.125, "rewards/len_reward_func": 0.48259803652763367, "step": 136 }, { "completion_length": 19.77083396911621, "epoch": 2.176, "grad_norm": 6.664411559311778, "kl": 0.345703125, "learning_rate": 4.259158078935615e-07, "loss": 0.0003, "reward": 0.8192708492279053, "reward_std": 0.37694354355335236, "rewards/correct_code_reward_func": 0.3541666716337204, "rewards/len_reward_func": 0.4651041626930237, "step": 137 }, { "completion_length": 39.125, "epoch": 2.192, "grad_norm": 1.456816045019513, "kl": 0.21484375, "learning_rate": 4.2476150309821437e-07, "loss": 0.0002, "reward": 0.6840218156576157, "reward_std": 0.2802516594529152, "rewards/correct_code_reward_func": 0.25, "rewards/len_reward_func": 0.43402181565761566, "step": 138 }, { "completion_length": 33.187500953674316, "epoch": 2.208, "grad_norm": 1.3547877377100939, "kl": 0.134033203125, "learning_rate": 4.235998656602091e-07, "loss": 0.0001, "reward": 0.5814135670661926, "reward_std": 0.23970085382461548, "rewards/correct_code_reward_func": 0.10416666977107525, "rewards/len_reward_func": 0.47724688053131104, "step": 139 }, { "completion_length": 36.95833492279053, "epoch": 2.224, "grad_norm": 1.2720844709707944, "kl": 0.2001953125, "learning_rate": 4.2243094431952607e-07, "loss": 0.0002, "reward": 0.638367086648941, "reward_std": 0.2601431868970394, "rewards/correct_code_reward_func": 0.16666667722165585, "rewards/len_reward_func": 0.47170040011405945, "step": 140 }, { "completion_length": 39.02083396911621, "epoch": 2.24, "grad_norm": 2.1139624484094615, "kl": 0.16943359375, "learning_rate": 4.2125478812176363e-07, "loss": 0.0002, "reward": 0.5840575993061066, "reward_std": 0.21259387582540512, "rewards/correct_code_reward_func": 0.1250000037252903, "rewards/len_reward_func": 0.4590575695037842, "step": 141 }, { "completion_length": 18.14583396911621, "epoch": 2.2560000000000002, "grad_norm": 3.9393507845319564, "kl": 0.3291015625, "learning_rate": 4.2007144641608035e-07, "loss": 0.0003, "reward": 0.7043379247188568, "reward_std": 0.22067928314208984, "rewards/correct_code_reward_func": 0.229166679084301, "rewards/len_reward_func": 0.4751712381839752, "step": 142 }, { "completion_length": 21.312500476837158, "epoch": 2.2720000000000002, "grad_norm": 3.09831462467243, "kl": 0.34912109375, "learning_rate": 4.188809688531241e-07, "loss": 0.0003, "reward": 0.8270089626312256, "reward_std": 0.38452374935150146, "rewards/correct_code_reward_func": 0.3750000149011612, "rewards/len_reward_func": 0.4520089328289032, "step": 143 }, { "completion_length": 37.895835876464844, "epoch": 2.288, "grad_norm": 2.026442157947375, "kl": 0.178955078125, "learning_rate": 4.1768340538294914e-07, "loss": 0.0002, "reward": 0.6618677079677582, "reward_std": 0.22813905775547028, "rewards/correct_code_reward_func": 0.2083333432674408, "rewards/len_reward_func": 0.4535343796014786, "step": 144 }, { "completion_length": 20.625, "epoch": 2.304, "grad_norm": 5.079652549790452, "kl": 0.287109375, "learning_rate": 4.1647880625292027e-07, "loss": 0.0003, "reward": 0.9085739850997925, "reward_std": 0.4442262500524521, "rewards/correct_code_reward_func": 0.4583333432674408, "rewards/len_reward_func": 0.4502406120300293, "step": 145 }, { "completion_length": 26.375000953674316, "epoch": 2.32, "grad_norm": 8.765502969688386, "kl": 0.27099609375, "learning_rate": 4.1526722200560436e-07, "loss": 0.0003, "reward": 1.0286418199539185, "reward_std": 0.5029588490724564, "rewards/correct_code_reward_func": 0.5833333730697632, "rewards/len_reward_func": 0.44530846178531647, "step": 146 }, { "completion_length": 33.39583396911621, "epoch": 2.336, "grad_norm": 4.101562007961352, "kl": 0.1669921875, "learning_rate": 4.140487034766499e-07, "loss": 0.0002, "reward": 0.9583333432674408, "reward_std": 0.31142252683639526, "rewards/correct_code_reward_func": 0.479166679084301, "rewards/len_reward_func": 0.4791666716337204, "step": 147 }, { "completion_length": 33.250000953674316, "epoch": 2.352, "grad_norm": 1.3769018597142264, "kl": 0.23291015625, "learning_rate": 4.1282330179265377e-07, "loss": 0.0002, "reward": 0.728137880563736, "reward_std": 0.3594149053096771, "rewards/correct_code_reward_func": 0.2500000074505806, "rewards/len_reward_func": 0.47813786566257477, "step": 148 }, { "completion_length": 35.10416793823242, "epoch": 2.368, "grad_norm": 1.764712623454699, "kl": 0.187255859375, "learning_rate": 4.115910683690167e-07, "loss": 0.0002, "reward": 0.6034694612026215, "reward_std": 0.18779680132865906, "rewards/correct_code_reward_func": 0.1458333432674408, "rewards/len_reward_func": 0.45763610303401947, "step": 149 }, { "completion_length": 23.58333396911621, "epoch": 2.384, "grad_norm": 1.5094768705414543, "kl": 0.3095703125, "learning_rate": 4.1035205490778496e-07, "loss": 0.0003, "reward": 0.8031023442745209, "reward_std": 0.2327413372695446, "rewards/correct_code_reward_func": 0.3541666716337204, "rewards/len_reward_func": 0.44893570244312286, "step": 150 }, { "completion_length": 35.125, "epoch": 2.4, "grad_norm": 2.0077887373974734, "kl": 0.19970703125, "learning_rate": 4.09106313395482e-07, "loss": 0.0002, "reward": 0.7374315559864044, "reward_std": 0.3449878916144371, "rewards/correct_code_reward_func": 0.3125000111758709, "rewards/len_reward_func": 0.42493152618408203, "step": 151 }, { "completion_length": 27.979167938232422, "epoch": 2.416, "grad_norm": 1.6931078080658426, "kl": 0.30908203125, "learning_rate": 4.078538961009268e-07, "loss": 0.0003, "reward": 0.7056878507137299, "reward_std": 0.2828039154410362, "rewards/correct_code_reward_func": 0.25000000558793545, "rewards/len_reward_func": 0.45568785071372986, "step": 152 }, { "completion_length": 26.041667938232422, "epoch": 2.432, "grad_norm": 3.4873197344938505, "kl": 0.234375, "learning_rate": 4.0659485557304047e-07, "loss": 0.0002, "reward": 0.8596743643283844, "reward_std": 0.3951665312051773, "rewards/correct_code_reward_func": 0.375, "rewards/len_reward_func": 0.4846743494272232, "step": 153 }, { "completion_length": 25.416667938232422, "epoch": 2.448, "grad_norm": 0.5832761487278126, "kl": 0.28125, "learning_rate": 4.0532924463864214e-07, "loss": 0.0003, "reward": 0.875, "reward_std": 0.16623876243829727, "rewards/correct_code_reward_func": 0.3958333432674408, "rewards/len_reward_func": 0.4791666716337204, "step": 154 }, { "completion_length": 33.75000190734863, "epoch": 2.464, "grad_norm": 3.3108288489196216, "kl": 0.24169921875, "learning_rate": 4.040571164002318e-07, "loss": 0.0002, "reward": 0.8077309429645538, "reward_std": 0.4106632024049759, "rewards/correct_code_reward_func": 0.3333333544433117, "rewards/len_reward_func": 0.47439758479595184, "step": 155 }, { "completion_length": 28.02083396911621, "epoch": 2.48, "grad_norm": 1.642835238916714, "kl": 0.23388671875, "learning_rate": 4.027785242337625e-07, "loss": 0.0002, "reward": 0.7023809850215912, "reward_std": 0.30807141959667206, "rewards/correct_code_reward_func": 0.229166679084301, "rewards/len_reward_func": 0.4732142984867096, "step": 156 }, { "completion_length": 23.375, "epoch": 2.496, "grad_norm": 2.10881607764319, "kl": 0.29638671875, "learning_rate": 4.0149352178640084e-07, "loss": 0.0003, "reward": 0.7916666865348816, "reward_std": 0.2994871214032173, "rewards/correct_code_reward_func": 0.2916666679084301, "rewards/len_reward_func": 0.5, "step": 157 }, { "completion_length": 39.437500953674316, "epoch": 2.512, "grad_norm": 1.494083480221189, "kl": 0.24658203125, "learning_rate": 4.002021629742759e-07, "loss": 0.0002, "reward": 0.6607388556003571, "reward_std": 0.2653798274695873, "rewards/correct_code_reward_func": 0.2083333395421505, "rewards/len_reward_func": 0.45240549743175507, "step": 158 }, { "completion_length": 24.83333396911621, "epoch": 2.528, "grad_norm": 4.504000272348399, "kl": 0.30859375, "learning_rate": 3.9890450198021705e-07, "loss": 0.0003, "reward": 0.8541666865348816, "reward_std": 0.36753228306770325, "rewards/correct_code_reward_func": 0.3541666865348816, "rewards/len_reward_func": 0.5, "step": 159 }, { "completion_length": 26.45833396911621, "epoch": 2.544, "grad_norm": 4.070154440728412, "kl": 0.3583984375, "learning_rate": 3.9760059325148063e-07, "loss": 0.0004, "reward": 1.0000000298023224, "reward_std": 0.15430335700511932, "rewards/correct_code_reward_func": 0.5000000298023224, "rewards/len_reward_func": 0.5, "step": 160 }, { "completion_length": 31.791667938232422, "epoch": 2.56, "grad_norm": 1.213750670059646, "kl": 0.24755859375, "learning_rate": 3.9629049149746556e-07, "loss": 0.0002, "reward": 0.8541666865348816, "reward_std": 0.3857453167438507, "rewards/correct_code_reward_func": 0.3541666865348816, "rewards/len_reward_func": 0.5, "step": 161 }, { "completion_length": 35.500000953674316, "epoch": 2.576, "grad_norm": 4.880537766814768, "kl": 0.2998046875, "learning_rate": 3.949742516874175e-07, "loss": 0.0003, "reward": 0.75, "reward_std": 0.2840898931026459, "rewards/correct_code_reward_func": 0.2708333358168602, "rewards/len_reward_func": 0.4791666716337204, "step": 162 }, { "completion_length": 32.81250190734863, "epoch": 2.592, "grad_norm": 0.6480053728324462, "kl": 0.2451171875, "learning_rate": 3.9365192904812263e-07, "loss": 0.0002, "reward": 0.7500000298023224, "reward_std": 0.15430335700511932, "rewards/correct_code_reward_func": 0.2500000074505806, "rewards/len_reward_func": 0.5, "step": 163 }, { "completion_length": 21.625000953674316, "epoch": 2.608, "grad_norm": 5.168623364947485, "kl": 0.3251953125, "learning_rate": 3.9232357906159065e-07, "loss": 0.0003, "reward": 0.9677360653877258, "reward_std": 0.36441104114055634, "rewards/correct_code_reward_func": 0.4791666716337204, "rewards/len_reward_func": 0.48856931924819946, "step": 164 }, { "completion_length": 27.229166984558105, "epoch": 2.624, "grad_norm": 1.7162957930860199, "kl": 0.20654296875, "learning_rate": 3.909892574627266e-07, "loss": 0.0002, "reward": 0.7291666865348816, "reward_std": 0.1480126492679119, "rewards/correct_code_reward_func": 0.229166679084301, "rewards/len_reward_func": 0.5, "step": 165 }, { "completion_length": 26.562500953674316, "epoch": 2.64, "grad_norm": 7.012645050314315, "kl": 0.265625, "learning_rate": 3.8964902023699234e-07, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.21322893351316452, "rewards/correct_code_reward_func": 0.3125, "rewards/len_reward_func": 0.5, "step": 166 }, { "completion_length": 33.4375, "epoch": 2.656, "grad_norm": 1.4618104253449193, "kl": 0.29443359375, "learning_rate": 3.8830292361805767e-07, "loss": 0.0003, "reward": 0.8750000596046448, "reward_std": 0.22233545035123825, "rewards/correct_code_reward_func": 0.3750000149011612, "rewards/len_reward_func": 0.5, "step": 167 }, { "completion_length": 50.333335876464844, "epoch": 2.672, "grad_norm": 1.1548968690502175, "kl": 0.3291015625, "learning_rate": 3.869510240854407e-07, "loss": 0.0003, "reward": 0.7708333432674408, "reward_std": 0.13607724010944366, "rewards/correct_code_reward_func": 0.2916666716337204, "rewards/len_reward_func": 0.4791666716337204, "step": 168 }, { "completion_length": 32.77083492279053, "epoch": 2.6879999999999997, "grad_norm": 1.7708211120173893, "kl": 0.22509765625, "learning_rate": 3.855933783621383e-07, "loss": 0.0002, "reward": 0.5873316824436188, "reward_std": 0.34993261098861694, "rewards/correct_code_reward_func": 0.1250000037252903, "rewards/len_reward_func": 0.4623316675424576, "step": 169 }, { "completion_length": 37.14583396911621, "epoch": 2.7039999999999997, "grad_norm": 0.8273511612323733, "kl": 0.24169921875, "learning_rate": 3.8423004341224595e-07, "loss": 0.0002, "reward": 1.125, "reward_std": 0.1178511306643486, "rewards/correct_code_reward_func": 0.625, "rewards/len_reward_func": 0.5, "step": 170 }, { "completion_length": 25.854167938232422, "epoch": 2.7199999999999998, "grad_norm": 4.844327377758548, "kl": 0.296875, "learning_rate": 3.828610764385676e-07, "loss": 0.0003, "reward": 0.8125000298023224, "reward_std": 0.3205290399491787, "rewards/correct_code_reward_func": 0.31250002048909664, "rewards/len_reward_func": 0.5, "step": 171 }, { "completion_length": 37.58333492279053, "epoch": 2.7359999999999998, "grad_norm": 0.85750009835101, "kl": 0.22412109375, "learning_rate": 3.8148653488021566e-07, "loss": 0.0002, "reward": 0.8333333730697632, "reward_std": 0.17251639068126678, "rewards/correct_code_reward_func": 0.3333333432674408, "rewards/len_reward_func": 0.5, "step": 172 }, { "completion_length": 19.39583396911621, "epoch": 2.752, "grad_norm": 4.546104049005407, "kl": 0.388671875, "learning_rate": 3.801064764102011e-07, "loss": 0.0004, "reward": 0.9583333730697632, "reward_std": 0.3268197476863861, "rewards/correct_code_reward_func": 0.458333358168602, "rewards/len_reward_func": 0.5, "step": 173 }, { "completion_length": 21.041666984558105, "epoch": 2.768, "grad_norm": 2.488704592885805, "kl": 0.328125, "learning_rate": 3.787209589330134e-07, "loss": 0.0003, "reward": 0.8333333432674408, "reward_std": 0.2903675436973572, "rewards/correct_code_reward_func": 0.3333333432674408, "rewards/len_reward_func": 0.5, "step": 174 }, { "completion_length": 25.750000953674316, "epoch": 2.784, "grad_norm": 3.4706174977001085, "kl": 0.3203125, "learning_rate": 3.773300405821908e-07, "loss": 0.0003, "reward": 0.625, "reward_std": 0.22233545035123825, "rewards/correct_code_reward_func": 0.125, "rewards/len_reward_func": 0.5, "step": 175 }, { "completion_length": 22.125000953674316, "epoch": 2.8, "grad_norm": 6.035407454671845, "kl": 0.333984375, "learning_rate": 3.759337797178816e-07, "loss": 0.0003, "reward": 0.7083333730697632, "reward_std": 0.24966806918382645, "rewards/correct_code_reward_func": 0.2083333395421505, "rewards/len_reward_func": 0.5, "step": 176 }, { "completion_length": 26.854166984558105, "epoch": 2.816, "grad_norm": 0.8948891454908713, "kl": 0.4052734375, "learning_rate": 3.745322349243954e-07, "loss": 0.0004, "reward": 0.7708333730697632, "reward_std": 0.14801263809204102, "rewards/correct_code_reward_func": 0.2708333432674408, "rewards/len_reward_func": 0.5, "step": 177 }, { "completion_length": 43.39583396911621, "epoch": 2.832, "grad_norm": 1.4563433011031992, "kl": 0.215087890625, "learning_rate": 3.7312546500774455e-07, "loss": 0.0002, "reward": 0.8541666865348816, "reward_std": 0.30859364569187164, "rewards/correct_code_reward_func": 0.3541666865348816, "rewards/len_reward_func": 0.5, "step": 178 }, { "completion_length": 17.562500953674316, "epoch": 2.848, "grad_norm": 6.496732643288798, "kl": 0.3759765625, "learning_rate": 3.717135289931774e-07, "loss": 0.0004, "reward": 0.8333333432674408, "reward_std": 0.19500282034277916, "rewards/correct_code_reward_func": 0.3333333432674408, "rewards/len_reward_func": 0.5, "step": 179 }, { "completion_length": 43.83333396911621, "epoch": 2.864, "grad_norm": 0.862953352923827, "kl": 0.63671875, "learning_rate": 3.7029648612270123e-07, "loss": 0.0006, "reward": 0.7500000298023224, "reward_std": 0.22233543917536736, "rewards/correct_code_reward_func": 0.2500000149011612, "rewards/len_reward_func": 0.5, "step": 180 }, { "completion_length": 44.35416793823242, "epoch": 2.88, "grad_norm": 1.1006354405018577, "kl": 0.19970703125, "learning_rate": 3.688743958525969e-07, "loss": 0.0002, "reward": 0.7916666865348816, "reward_std": 0.2840898931026459, "rewards/correct_code_reward_func": 0.2916666716337204, "rewards/len_reward_func": 0.5, "step": 181 }, { "completion_length": 30.02083396911621, "epoch": 2.896, "grad_norm": 0.43741770998780366, "kl": 0.3212890625, "learning_rate": 3.6744731785092393e-07, "loss": 0.0003, "reward": 0.7500000298023224, "reward_std": 0.08908708393573761, "rewards/correct_code_reward_func": 0.2500000074505806, "rewards/len_reward_func": 0.5, "step": 182 }, { "completion_length": 39.291666984558105, "epoch": 2.912, "grad_norm": 1.0212558082151113, "kl": 0.263671875, "learning_rate": 3.660153119950171e-07, "loss": 0.0003, "reward": 0.7708333730697632, "reward_std": 0.14801263809204102, "rewards/correct_code_reward_func": 0.2708333432674408, "rewards/len_reward_func": 0.5, "step": 183 }, { "completion_length": 32.14583492279053, "epoch": 2.928, "grad_norm": 1.0645346014541865, "kl": 0.28955078125, "learning_rate": 3.6457843836897417e-07, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.3630879074335098, "rewards/correct_code_reward_func": 0.4583333432674408, "rewards/len_reward_func": 0.4791666716337204, "step": 184 }, { "completion_length": 27.70833396911621, "epoch": 2.944, "grad_norm": 4.902484194089981, "kl": 0.3564453125, "learning_rate": 3.6313675726113475e-07, "loss": 0.0004, "reward": 0.979166716337204, "reward_std": 0.33592626452445984, "rewards/correct_code_reward_func": 0.4791666865348816, "rewards/len_reward_func": 0.5, "step": 185 }, { "completion_length": 38.5, "epoch": 2.96, "grad_norm": 2.7460144649848104, "kl": 0.2890625, "learning_rate": 3.6169032916155055e-07, "loss": 0.0003, "reward": 0.7708333432674408, "reward_std": 0.24056155234575272, "rewards/correct_code_reward_func": 0.2708333432674408, "rewards/len_reward_func": 0.5, "step": 186 }, { "completion_length": 30.375001907348633, "epoch": 2.976, "grad_norm": 1.5868616221089409, "kl": 0.4033203125, "learning_rate": 3.602392147594479e-07, "loss": 0.0004, "reward": 0.9375000298023224, "reward_std": 0.30231601744890213, "rewards/correct_code_reward_func": 0.4375000149011612, "rewards/len_reward_func": 0.5, "step": 187 }, { "completion_length": 31.979167938232422, "epoch": 2.992, "grad_norm": 1.6492447606938656, "kl": 0.765625, "learning_rate": 3.587834749406808e-07, "loss": 0.0008, "reward": 0.791666716337204, "reward_std": 0.1451837606728077, "rewards/correct_code_reward_func": 0.29166667722165585, "rewards/len_reward_func": 0.5, "step": 188 }, { "completion_length": 34.583335876464844, "epoch": 3.0, "grad_norm": 1.6492447606938656, "kl": 0.353515625, "learning_rate": 3.573231707851765e-07, "loss": 0.0002, "reward": 0.5, "reward_std": 0.0, "rewards/correct_code_reward_func": 0.0, "rewards/len_reward_func": 0.5, "step": 189 }, { "completion_length": 25.437501907348633, "epoch": 3.016, "grad_norm": 0.6202012399572476, "kl": 0.328125, "learning_rate": 3.558583635643726e-07, "loss": 0.0003, "reward": 0.7916666865348816, "reward_std": 0.2342708557844162, "rewards/correct_code_reward_func": 0.291666679084301, "rewards/len_reward_func": 0.5, "step": 190 }, { "completion_length": 45.16666793823242, "epoch": 3.032, "grad_norm": 3.268492949679133, "kl": 0.30029296875, "learning_rate": 3.543891147386463e-07, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.22516433894634247, "rewards/correct_code_reward_func": 0.4375, "rewards/len_reward_func": 0.5, "step": 191 }, { "completion_length": 24.45833396911621, "epoch": 3.048, "grad_norm": 5.775551595741029, "kl": 0.31640625, "learning_rate": 3.52915485954736e-07, "loss": 0.0003, "reward": 1.125, "reward_std": 0.2994871102273464, "rewards/correct_code_reward_func": 0.625, "rewards/len_reward_func": 0.5, "step": 192 }, { "completion_length": 33.77083396911621, "epoch": 3.064, "grad_norm": 1.32331402024035, "kl": 11.91748046875, "learning_rate": 3.514375390431539e-07, "loss": 0.012, "reward": 0.7708333730697632, "reward_std": 0.16340987384319305, "rewards/correct_code_reward_func": 0.2708333432674408, "rewards/len_reward_func": 0.5, "step": 193 }, { "completion_length": 25.041667938232422, "epoch": 3.08, "grad_norm": 15.903056378391673, "kl": 0.3701171875, "learning_rate": 3.4995533601559225e-07, "loss": 0.0004, "reward": 1.1250000596046448, "reward_std": 0.07715167850255966, "rewards/correct_code_reward_func": 0.6250000149011612, "rewards/len_reward_func": 0.5, "step": 194 }, { "completion_length": 32.10416793823242, "epoch": 3.096, "grad_norm": 0.8594817829407627, "kl": 1.7802734375, "learning_rate": 3.484689390623218e-07, "loss": 0.0018, "reward": 0.6875000298023224, "reward_std": 0.1767766959965229, "rewards/correct_code_reward_func": 0.1875000111758709, "rewards/len_reward_func": 0.5, "step": 195 }, { "completion_length": 16.645833492279053, "epoch": 3.112, "grad_norm": 2.729844658975061, "kl": 0.443359375, "learning_rate": 3.469784105495816e-07, "loss": 0.0004, "reward": 1.2708333730697632, "reward_std": 0.1767766959965229, "rewards/correct_code_reward_func": 0.7708333432674408, "rewards/len_reward_func": 0.5, "step": 196 }, { "completion_length": 42.00000286102295, "epoch": 3.128, "grad_norm": 2.9296646774191917, "kl": 0.53515625, "learning_rate": 3.4548381301696295e-07, "loss": 0.0005, "reward": 0.916666716337204, "reward_std": 0.3794546127319336, "rewards/correct_code_reward_func": 0.4166666865348816, "rewards/len_reward_func": 0.5, "step": 197 }, { "completion_length": 45.5, "epoch": 3.144, "grad_norm": 1.7226032460824214, "kl": 0.4296875, "learning_rate": 3.4398520917478476e-07, "loss": 0.0004, "reward": 0.8958333730697632, "reward_std": 0.28126100450754166, "rewards/correct_code_reward_func": 0.395833358168602, "rewards/len_reward_func": 0.5, "step": 198 }, { "completion_length": 33.79166793823242, "epoch": 3.16, "grad_norm": 1.0268275604382027, "kl": 0.3486328125, "learning_rate": 3.42482661901463e-07, "loss": 0.0003, "reward": 0.7291666865348816, "reward_std": 0.22516431659460068, "rewards/correct_code_reward_func": 0.2500000111758709, "rewards/len_reward_func": 0.4791666716337204, "step": 199 }, { "completion_length": 35.43750190734863, "epoch": 3.176, "grad_norm": 1.0861549087201527, "kl": 0.287109375, "learning_rate": 3.409762342408719e-07, "loss": 0.0003, "reward": 0.7500000298023224, "reward_std": 0.19500280916690826, "rewards/correct_code_reward_func": 0.2500000111758709, "rewards/len_reward_func": 0.5, "step": 200 }, { "completion_length": 55.479166984558105, "epoch": 3.192, "grad_norm": 1.5283228572290333, "kl": 0.375, "learning_rate": 3.3946598939969893e-07, "loss": 0.0004, "reward": 0.8333333730697632, "reward_std": 0.15430335700511932, "rewards/correct_code_reward_func": 0.3333333432674408, "rewards/len_reward_func": 0.5, "step": 201 }, { "completion_length": 23.9375, "epoch": 3.208, "grad_norm": 1.2041133220324134, "kl": 0.4013671875, "learning_rate": 3.379519907447931e-07, "loss": 0.0004, "reward": 1.0, "reward_std": 0.24339044094085693, "rewards/correct_code_reward_func": 0.5, "rewards/len_reward_func": 0.5, "step": 202 }, { "completion_length": 41.70833396911621, "epoch": 3.224, "grad_norm": 1.0540529518722053, "kl": 0.2734375, "learning_rate": 3.364343018005057e-07, "loss": 0.0003, "reward": 0.7291666865348816, "reward_std": 0.08625819534063339, "rewards/correct_code_reward_func": 0.2291666716337204, "rewards/len_reward_func": 0.5, "step": 203 }, { "completion_length": 22.187500953674316, "epoch": 3.24, "grad_norm": 7.138753075856611, "kl": 0.3388671875, "learning_rate": 3.349129862460251e-07, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.25249695032835007, "rewards/correct_code_reward_func": 0.4375, "rewards/len_reward_func": 0.5, "step": 204 }, { "completion_length": 22.562500953674316, "epoch": 3.2560000000000002, "grad_norm": 2.174154407115016, "kl": 0.3720703125, "learning_rate": 3.3338810791270517e-07, "loss": 0.0004, "reward": 0.8958333432674408, "reward_std": 0.3205290399491787, "rewards/correct_code_reward_func": 0.3958333432674408, "rewards/len_reward_func": 0.5, "step": 205 }, { "completion_length": 26.770834922790527, "epoch": 3.2720000000000002, "grad_norm": 1.5444039155167695, "kl": 0.490234375, "learning_rate": 3.318597307813866e-07, "loss": 0.0005, "reward": 0.7916666865348816, "reward_std": 0.2342708334326744, "rewards/correct_code_reward_func": 0.2916666716337204, "rewards/len_reward_func": 0.5, "step": 206 }, { "completion_length": 28.000000953674316, "epoch": 3.288, "grad_norm": 4.365687287010453, "kl": 0.349609375, "learning_rate": 3.3032791897971307e-07, "loss": 0.0003, "reward": 0.7916666865348816, "reward_std": 0.2630349025130272, "rewards/correct_code_reward_func": 0.2916666716337204, "rewards/len_reward_func": 0.5, "step": 207 }, { "completion_length": 37.14583396911621, "epoch": 3.304, "grad_norm": 3.197536110678343, "kl": 0.283203125, "learning_rate": 3.287927367794397e-07, "loss": 0.0003, "reward": 0.7973356544971466, "reward_std": 0.21966809779405594, "rewards/correct_code_reward_func": 0.3125000149011612, "rewards/len_reward_func": 0.484835609793663, "step": 208 }, { "completion_length": 32.35416793823242, "epoch": 3.32, "grad_norm": 3.3805334494300956, "kl": 0.619140625, "learning_rate": 3.272542485937368e-07, "loss": 0.0006, "reward": 0.7500000298023224, "reward_std": 0.22233543917536736, "rewards/correct_code_reward_func": 0.25000000558793545, "rewards/len_reward_func": 0.5, "step": 209 }, { "completion_length": 22.4375, "epoch": 3.336, "grad_norm": 1.6447820848367736, "kl": 1.8876953125, "learning_rate": 3.2571251897448763e-07, "loss": 0.0019, "reward": 1.0416666865348816, "reward_std": 0.2994871288537979, "rewards/correct_code_reward_func": 0.5416666865348816, "rewards/len_reward_func": 0.5, "step": 210 }, { "completion_length": 27.5625, "epoch": 3.352, "grad_norm": 3.389054471629676, "kl": 0.3701171875, "learning_rate": 3.241676126095792e-07, "loss": 0.0004, "reward": 1.0, "reward_std": 0.42927365005016327, "rewards/correct_code_reward_func": 0.5, "rewards/len_reward_func": 0.5, "step": 211 }, { "completion_length": 24.916666984558105, "epoch": 3.368, "grad_norm": 12.368626256137578, "kl": 0.3408203125, "learning_rate": 3.226195943201883e-07, "loss": 0.0003, "reward": 0.8541666865348816, "reward_std": 0.204109326004982, "rewards/correct_code_reward_func": 0.3541666865348816, "rewards/len_reward_func": 0.5, "step": 212 }, { "completion_length": 25.58333396911621, "epoch": 3.384, "grad_norm": 1.1298161238486502, "kl": 0.416015625, "learning_rate": 3.2106852905806216e-07, "loss": 0.0004, "reward": 0.9375000596046448, "reward_std": 0.08625819534063339, "rewards/correct_code_reward_func": 0.4375000149011612, "rewards/len_reward_func": 0.5, "step": 213 }, { "completion_length": 30.104167938232422, "epoch": 3.4, "grad_norm": 10.260020475970132, "kl": 0.408203125, "learning_rate": 3.1951448190279253e-07, "loss": 0.0004, "reward": 0.9583333432674408, "reward_std": 0.1178511306643486, "rewards/correct_code_reward_func": 0.4583333432674408, "rewards/len_reward_func": 0.5, "step": 214 }, { "completion_length": 29.791666984558105, "epoch": 3.416, "grad_norm": 3.3670910458604015, "kl": 0.37841796875, "learning_rate": 3.179575180590857e-07, "loss": 0.0004, "reward": 0.7916666865348816, "reward_std": 0.2840898931026459, "rewards/correct_code_reward_func": 0.2916666716337204, "rewards/len_reward_func": 0.5, "step": 215 }, { "completion_length": 22.916667938232422, "epoch": 3.432, "grad_norm": 8.332848830151702, "kl": 0.3994140625, "learning_rate": 3.163977028540263e-07, "loss": 0.0004, "reward": 1.041666716337204, "reward_std": 0.2630349025130272, "rewards/correct_code_reward_func": 0.5416666865348816, "rewards/len_reward_func": 0.5, "step": 216 }, { "completion_length": 28.854167938232422, "epoch": 3.448, "grad_norm": 1.29269474567732, "kl": 0.41015625, "learning_rate": 3.1483510173433627e-07, "loss": 0.0004, "reward": 0.9166666865348816, "reward_std": 0.1451837606728077, "rewards/correct_code_reward_func": 0.4166666716337204, "rewards/len_reward_func": 0.5, "step": 217 }, { "completion_length": 55.583335876464844, "epoch": 3.464, "grad_norm": 1.1956536549620946, "kl": 1.72021484375, "learning_rate": 3.1326978026362905e-07, "loss": 0.0017, "reward": 0.5416666865348816, "reward_std": 0.19500282034277916, "rewards/correct_code_reward_func": 0.06250000186264515, "rewards/len_reward_func": 0.4791666716337204, "step": 218 }, { "completion_length": 32.66666793823242, "epoch": 3.48, "grad_norm": 7.474594998118115, "kl": 1.87890625, "learning_rate": 3.1170180411965854e-07, "loss": 0.0019, "reward": 0.6458333432674408, "reward_std": 0.13607725501060486, "rewards/correct_code_reward_func": 0.1458333432674408, "rewards/len_reward_func": 0.5, "step": 219 }, { "completion_length": 33.93750190734863, "epoch": 3.496, "grad_norm": 0.14003358731788035, "kl": 0.5478515625, "learning_rate": 3.101312390915634e-07, "loss": 0.0005, "reward": 0.916666716337204, "reward_std": 0.15430335700511932, "rewards/correct_code_reward_func": 0.416666679084301, "rewards/len_reward_func": 0.5, "step": 220 }, { "completion_length": 46.0625, "epoch": 3.512, "grad_norm": 3.45663103404313, "kl": 0.2705078125, "learning_rate": 3.0855815107710665e-07, "loss": 0.0003, "reward": 0.8541666865348816, "reward_std": 0.28126102685928345, "rewards/correct_code_reward_func": 0.3541666865348816, "rewards/len_reward_func": 0.5, "step": 221 }, { "completion_length": 26.375000953674316, "epoch": 3.528, "grad_norm": 1.4588393763010847, "kl": 0.861328125, "learning_rate": 3.069826060799109e-07, "loss": 0.0009, "reward": 0.7708333432674408, "reward_std": 0.28126101940870285, "rewards/correct_code_reward_func": 0.2708333432674408, "rewards/len_reward_func": 0.5, "step": 222 }, { "completion_length": 26.479167938232422, "epoch": 3.544, "grad_norm": 3.8929614164173505, "kl": 0.798828125, "learning_rate": 3.054046702066886e-07, "loss": 0.0008, "reward": 0.8750000596046448, "reward_std": 0.19500280916690826, "rewards/correct_code_reward_func": 0.3750000111758709, "rewards/len_reward_func": 0.5, "step": 223 }, { "completion_length": 37.75000190734863, "epoch": 3.56, "grad_norm": 1.3984016713523089, "kl": 0.29345703125, "learning_rate": 3.038244096644687e-07, "loss": 0.0003, "reward": 0.7500000298023224, "reward_std": 0.15430335700511932, "rewards/correct_code_reward_func": 0.2500000111758709, "rewards/len_reward_func": 0.5, "step": 224 }, { "completion_length": 23.541666984558105, "epoch": 3.576, "grad_norm": 1.132022810723016, "kl": 0.4326171875, "learning_rate": 3.022418907578188e-07, "loss": 0.0004, "reward": 0.8958333432674408, "reward_std": 0.264432355761528, "rewards/correct_code_reward_func": 0.3958333432674408, "rewards/len_reward_func": 0.5, "step": 225 }, { "completion_length": 27.666666984558105, "epoch": 3.592, "grad_norm": 0.9291842503741052, "kl": 0.4638671875, "learning_rate": 3.0065717988606256e-07, "loss": 0.0005, "reward": 0.7500000298023224, "reward_std": 0.1451837606728077, "rewards/correct_code_reward_func": 0.2500000149011612, "rewards/len_reward_func": 0.5, "step": 226 }, { "completion_length": 37.0625, "epoch": 3.608, "grad_norm": 2.994489684005205, "kl": 0.58984375, "learning_rate": 2.990703435404944e-07, "loss": 0.0006, "reward": 1.1458333730697632, "reward_std": 0.0589255653321743, "rewards/correct_code_reward_func": 0.6458333432674408, "rewards/len_reward_func": 0.5, "step": 227 }, { "completion_length": 17.479166984558105, "epoch": 3.624, "grad_norm": 25.48314385427609, "kl": 5.3251953125, "learning_rate": 2.974814483015892e-07, "loss": 0.0053, "reward": 1.2291666865348816, "reward_std": 0.3857453167438507, "rewards/correct_code_reward_func": 0.7291666865348816, "rewards/len_reward_func": 0.5, "step": 228 }, { "completion_length": 25.64583396911621, "epoch": 3.64, "grad_norm": 5.331589909011615, "kl": 0.4599609375, "learning_rate": 2.95890560836209e-07, "loss": 0.0005, "reward": 0.8333333730697632, "reward_std": 0.22233545035123825, "rewards/correct_code_reward_func": 0.3333333544433117, "rewards/len_reward_func": 0.5, "step": 229 }, { "completion_length": 39.27083396911621, "epoch": 3.656, "grad_norm": 0.9614447916816511, "kl": 0.29052734375, "learning_rate": 2.942977478948057e-07, "loss": 0.0003, "reward": 1.0208333730697632, "reward_std": 0.22516431659460068, "rewards/correct_code_reward_func": 0.5416666865348816, "rewards/len_reward_func": 0.4791666716337204, "step": 230 }, { "completion_length": 37.62500190734863, "epoch": 3.672, "grad_norm": 1.6189285355243652, "kl": 82.1376953125, "learning_rate": 2.9270307630862006e-07, "loss": 0.0816, "reward": 0.875, "reward_std": 0.4159068316221237, "rewards/correct_code_reward_func": 0.375, "rewards/len_reward_func": 0.5, "step": 231 }, { "completion_length": 37.64583396911621, "epoch": 3.6879999999999997, "grad_norm": 144.3039978026888, "kl": 0.255859375, "learning_rate": 2.911066129868782e-07, "loss": 0.0003, "reward": 1.0625, "reward_std": 0.1480126492679119, "rewards/correct_code_reward_func": 0.5625, "rewards/len_reward_func": 0.5, "step": 232 }, { "completion_length": 28.916667938232422, "epoch": 3.7039999999999997, "grad_norm": 3.7202345670411265, "kl": 0.3232421875, "learning_rate": 2.8950842491398355e-07, "loss": 0.0003, "reward": 0.6250000298023224, "reward_std": 0.22233543917536736, "rewards/correct_code_reward_func": 0.12500000558793545, "rewards/len_reward_func": 0.5, "step": 233 }, { "completion_length": 46.83333492279053, "epoch": 3.7199999999999998, "grad_norm": 1.2743344086109105, "kl": 0.2841796875, "learning_rate": 2.87908579146707e-07, "loss": 0.0003, "reward": 0.7083333730697632, "reward_std": 0.19500280916690826, "rewards/correct_code_reward_func": 0.2083333395421505, "rewards/len_reward_func": 0.5, "step": 234 }, { "completion_length": 27.25, "epoch": 3.7359999999999998, "grad_norm": 10.188313773698406, "kl": 8.3095703125, "learning_rate": 2.863071428113726e-07, "loss": 0.0083, "reward": 1.0833333432674408, "reward_std": 0.08908708393573761, "rewards/correct_code_reward_func": 0.5833333358168602, "rewards/len_reward_func": 0.5, "step": 235 }, { "completion_length": 32.62500190734863, "epoch": 3.752, "grad_norm": 4.109218918644723, "kl": 1.931640625, "learning_rate": 2.847041831010417e-07, "loss": 0.0019, "reward": 0.8233599662780762, "reward_std": 0.18251240625977516, "rewards/correct_code_reward_func": 0.3333333432674408, "rewards/len_reward_func": 0.490026593208313, "step": 236 }, { "completion_length": 31.083334922790527, "epoch": 3.768, "grad_norm": 1.8606521647165692, "kl": 0.73828125, "learning_rate": 2.830997672726933e-07, "loss": 0.0007, "reward": 1.1041666865348816, "reward_std": 0.30859362706542015, "rewards/correct_code_reward_func": 0.6041666865348816, "rewards/len_reward_func": 0.5, "step": 237 }, { "completion_length": 55.354169845581055, "epoch": 3.784, "grad_norm": 2.734615110690433, "kl": 0.24951171875, "learning_rate": 2.8149396264440227e-07, "loss": 0.0003, "reward": 0.5, "reward_std": 0.0, "rewards/correct_code_reward_func": 0.0, "rewards/len_reward_func": 0.5, "step": 238 }, { "completion_length": 41.291666984558105, "epoch": 3.8, "grad_norm": 0.012449679074169549, "kl": 0.2978515625, "learning_rate": 2.798868365925147e-07, "loss": 0.0003, "reward": 0.5, "reward_std": 0.0, "rewards/correct_code_reward_func": 0.0, "rewards/len_reward_func": 0.5, "step": 239 }, { "completion_length": 36.77083492279053, "epoch": 3.816, "grad_norm": 7.797923174364327, "kl": 0.3642578125, "learning_rate": 2.782784565488211e-07, "loss": 0.0004, "reward": 0.7975983917713165, "reward_std": 0.10107371583580971, "rewards/correct_code_reward_func": 0.3125000149011612, "rewards/len_reward_func": 0.48509839177131653, "step": 240 }, { "completion_length": 28.375000953674316, "epoch": 3.832, "grad_norm": 2.477116298025544, "kl": 0.24755859375, "learning_rate": 2.7666888999772656e-07, "loss": 0.0002, "reward": 0.9375000596046448, "reward_std": 0.2587745860219002, "rewards/correct_code_reward_func": 0.4375000149011612, "rewards/len_reward_func": 0.5, "step": 241 }, { "completion_length": 30.416666984558105, "epoch": 3.848, "grad_norm": 0.9811120055537635, "kl": 9.095703125, "learning_rate": 2.7505820447342024e-07, "loss": 0.0091, "reward": 0.6666666865348816, "reward_std": 0.0, "rewards/correct_code_reward_func": 0.1666666716337204, "rewards/len_reward_func": 0.5, "step": 242 }, { "completion_length": 19.39583396911621, "epoch": 3.864, "grad_norm": 15.396866385585833, "kl": 1.05078125, "learning_rate": 2.7344646755704073e-07, "loss": 0.001, "reward": 1.1875000596046448, "reward_std": 0.3219604417681694, "rewards/correct_code_reward_func": 0.6875000298023224, "rewards/len_reward_func": 0.5, "step": 243 }, { "completion_length": 23.041667461395264, "epoch": 3.88, "grad_norm": 12.447948337219644, "kl": 0.421875, "learning_rate": 2.7183374687384096e-07, "loss": 0.0004, "reward": 1.0000000596046448, "reward_std": 0.19500280916690826, "rewards/correct_code_reward_func": 0.5000000111758709, "rewards/len_reward_func": 0.5, "step": 244 }, { "completion_length": 43.3125, "epoch": 3.896, "grad_norm": 1.581341578871288, "kl": 0.2802734375, "learning_rate": 2.7022011009035107e-07, "loss": 0.0003, "reward": 0.8125000298023224, "reward_std": 0.0589255653321743, "rewards/correct_code_reward_func": 0.3125000149011612, "rewards/len_reward_func": 0.5, "step": 245 }, { "completion_length": 34.208335876464844, "epoch": 3.912, "grad_norm": 0.9152208180228742, "kl": 2.37890625, "learning_rate": 2.686056249115385e-07, "loss": 0.0024, "reward": 0.9791666865348816, "reward_std": 0.4748324006795883, "rewards/correct_code_reward_func": 0.4791666865348816, "rewards/len_reward_func": 0.5, "step": 246 }, { "completion_length": 35.83333492279053, "epoch": 3.928, "grad_norm": 1.5451820654194635, "kl": 0.513671875, "learning_rate": 2.669903590779679e-07, "loss": 0.0005, "reward": 0.7083333730697632, "reward_std": 0.19500280916690826, "rewards/correct_code_reward_func": 0.2291666679084301, "rewards/len_reward_func": 0.4791666716337204, "step": 247 }, { "completion_length": 23.625000953674316, "epoch": 3.944, "grad_norm": 3.4351450173628177, "kl": 0.419921875, "learning_rate": 2.653743803629587e-07, "loss": 0.0004, "reward": 1.1250000298023224, "reward_std": 0.19500282034277916, "rewards/correct_code_reward_func": 0.6250000298023224, "rewards/len_reward_func": 0.5, "step": 248 }, { "completion_length": 22.916666984558105, "epoch": 3.96, "grad_norm": 4.124748952346768, "kl": 0.484375, "learning_rate": 2.637577565697412e-07, "loss": 0.0005, "reward": 0.9583333432674408, "reward_std": 0.2994871288537979, "rewards/correct_code_reward_func": 0.4583333432674408, "rewards/len_reward_func": 0.5, "step": 249 }, { "completion_length": 26.437500953674316, "epoch": 3.976, "grad_norm": 7.576926136134958, "kl": 0.8662109375, "learning_rate": 2.621405555286121e-07, "loss": 0.0009, "reward": 1.0833333730697632, "reward_std": 0.1451837606728077, "rewards/correct_code_reward_func": 0.583333358168602, "rewards/len_reward_func": 0.5, "step": 250 }, { "completion_length": 24.89583396911621, "epoch": 3.992, "grad_norm": 19.398267903289618, "kl": 0.6259765625, "learning_rate": 2.60522845094088e-07, "loss": 0.0006, "reward": 1.1458333730697632, "reward_std": 0.1767766959965229, "rewards/correct_code_reward_func": 0.6458333432674408, "rewards/len_reward_func": 0.5, "step": 251 }, { "completion_length": 28.625, "epoch": 4.0, "grad_norm": 4.9899585795469665, "kl": 0.79296875, "learning_rate": 2.589046931420589e-07, "loss": 0.0004, "reward": 0.8333333730697632, "reward_std": 0.34503278136253357, "rewards/correct_code_reward_func": 0.3333333432674408, "rewards/len_reward_func": 0.5, "step": 252 }, { "completion_length": 28.89583396911621, "epoch": 4.016, "grad_norm": 3.4544081031205565, "kl": 0.3349609375, "learning_rate": 2.572861675669399e-07, "loss": 0.0003, "reward": 0.7500000298023224, "reward_std": 0.24966806918382645, "rewards/correct_code_reward_func": 0.2500000111758709, "rewards/len_reward_func": 0.5, "step": 253 }, { "completion_length": 27.354167938232422, "epoch": 4.032, "grad_norm": 5.360212409444759, "kl": 8.87109375, "learning_rate": 2.556673362788225e-07, "loss": 0.0089, "reward": 0.8750000298023224, "reward_std": 0.2721545025706291, "rewards/correct_code_reward_func": 0.3750000149011612, "rewards/len_reward_func": 0.5, "step": 254 }, { "completion_length": 56.958335876464844, "epoch": 4.048, "grad_norm": 4.337364120588155, "kl": 0.26171875, "learning_rate": 2.540482672006254e-07, "loss": 0.0003, "reward": 0.8750000298023224, "reward_std": 0.2553258389234543, "rewards/correct_code_reward_func": 0.3958333432674408, "rewards/len_reward_func": 0.4791666716337204, "step": 255 }, { "completion_length": 38.14583492279053, "epoch": 4.064, "grad_norm": 5.937461009037335, "kl": 0.2958984375, "learning_rate": 2.524290282652443e-07, "loss": 0.0003, "reward": 0.9583333432674408, "reward_std": 0.2357022576034069, "rewards/correct_code_reward_func": 0.4791666716337204, "rewards/len_reward_func": 0.4791666716337204, "step": 256 }, { "completion_length": 40.70833396911621, "epoch": 4.08, "grad_norm": 6.960976871582724, "kl": 0.28515625, "learning_rate": 2.508096874127022e-07, "loss": 0.0003, "reward": 1.0416666865348816, "reward_std": 0.3177001625299454, "rewards/correct_code_reward_func": 0.5416666865348816, "rewards/len_reward_func": 0.5, "step": 257 }, { "completion_length": 22.58333396911621, "epoch": 4.096, "grad_norm": 3.64669977435422, "kl": 0.2900390625, "learning_rate": 2.4919031258729785e-07, "loss": 0.0003, "reward": 1.1250000596046448, "reward_std": 0.2314550280570984, "rewards/correct_code_reward_func": 0.6250000298023224, "rewards/len_reward_func": 0.5, "step": 258 }, { "completion_length": 34.83333492279053, "epoch": 4.112, "grad_norm": 644.251002590668, "kl": 290.12451171875, "learning_rate": 2.475709717347557e-07, "loss": 0.2906, "reward": 0.7708333432674408, "reward_std": 0.2041093371808529, "rewards/correct_code_reward_func": 0.2708333432674408, "rewards/len_reward_func": 0.5, "step": 259 }, { "completion_length": 26.83333396911621, "epoch": 4.128, "grad_norm": 11.440118730433715, "kl": 1.41796875, "learning_rate": 2.459517327993746e-07, "loss": 0.0014, "reward": 0.9375000298023224, "reward_std": 0.28126100450754166, "rewards/correct_code_reward_func": 0.4375000298023224, "rewards/len_reward_func": 0.5, "step": 260 }, { "completion_length": 29.520834922790527, "epoch": 4.144, "grad_norm": 179.75006965593022, "kl": 104.625, "learning_rate": 2.443326637211775e-07, "loss": 0.1045, "reward": 1.1041667461395264, "reward_std": 0.22516432404518127, "rewards/correct_code_reward_func": 0.6041666865348816, "rewards/len_reward_func": 0.5, "step": 261 }, { "completion_length": 20.33333396911621, "epoch": 4.16, "grad_norm": 2.132132107066578, "kl": 0.3193359375, "learning_rate": 2.427138324330601e-07, "loss": 0.0003, "reward": 1.4583333730697632, "reward_std": 0.1178511306643486, "rewards/correct_code_reward_func": 0.9583333730697632, "rewards/len_reward_func": 0.5, "step": 262 }, { "completion_length": 29.812501430511475, "epoch": 4.176, "grad_norm": 157.34801225037154, "kl": 62.5693359375, "learning_rate": 2.4109530685794106e-07, "loss": 0.0626, "reward": 1.0833333730697632, "reward_std": 0.2630349025130272, "rewards/correct_code_reward_func": 0.6041666865348816, "rewards/len_reward_func": 0.4791666716337204, "step": 263 }, { "completion_length": 25.89583396911621, "epoch": 4.192, "grad_norm": 4.26239211159855, "kl": 0.5234375, "learning_rate": 2.3947715490591203e-07, "loss": 0.0005, "reward": 0.875, "reward_std": 0.36751921474933624, "rewards/correct_code_reward_func": 0.375, "rewards/len_reward_func": 0.5, "step": 264 }, { "completion_length": 41.979169845581055, "epoch": 4.208, "grad_norm": 1.0123222669357814, "kl": 0.27734375, "learning_rate": 2.37859444471388e-07, "loss": 0.0003, "reward": 0.6458333432674408, "reward_std": 0.24056155234575272, "rewards/correct_code_reward_func": 0.1458333358168602, "rewards/len_reward_func": 0.5, "step": 265 }, { "completion_length": 27.854166984558105, "epoch": 4.224, "grad_norm": 117.90954129112552, "kl": 75.134765625, "learning_rate": 2.3624224343025876e-07, "loss": 0.0751, "reward": 0.7291666865348816, "reward_std": 0.16340987384319305, "rewards/correct_code_reward_func": 0.2291666716337204, "rewards/len_reward_func": 0.5, "step": 266 }, { "completion_length": 39.5625, "epoch": 4.24, "grad_norm": 5.1933183547730986, "kl": 1.02783203125, "learning_rate": 2.346256196370413e-07, "loss": 0.001, "reward": 0.6837384402751923, "reward_std": 0.34314342588186264, "rewards/correct_code_reward_func": 0.2083333432674408, "rewards/len_reward_func": 0.47540509700775146, "step": 267 }, { "completion_length": 92.64583587646484, "epoch": 4.256, "grad_norm": 0.9859859289357618, "kl": 0.79638671875, "learning_rate": 2.3300964092203203e-07, "loss": 0.0008, "reward": 0.5416666865348816, "reward_std": 0.07715167850255966, "rewards/correct_code_reward_func": 0.0416666679084301, "rewards/len_reward_func": 0.5, "step": 268 }, { "completion_length": 34.89583396911621, "epoch": 4.272, "grad_norm": 16.913050019636458, "kl": 16.21875, "learning_rate": 2.3139437508846152e-07, "loss": 0.0162, "reward": 0.5208333432674408, "reward_std": 0.0589255653321743, "rewards/correct_code_reward_func": 0.02083333395421505, "rewards/len_reward_func": 0.5, "step": 269 }, { "completion_length": 58.29166793823242, "epoch": 4.288, "grad_norm": 1.2524162935664223, "kl": 0.302734375, "learning_rate": 2.2977988990964896e-07, "loss": 0.0003, "reward": 0.7291666865348816, "reward_std": 0.08625819534063339, "rewards/correct_code_reward_func": 0.2291666716337204, "rewards/len_reward_func": 0.5, "step": 270 }, { "completion_length": 33.12500190734863, "epoch": 4.304, "grad_norm": 4.007391098481951, "kl": 0.32080078125, "learning_rate": 2.28166253126159e-07, "loss": 0.0003, "reward": 1.0833333730697632, "reward_std": 0.4173382371664047, "rewards/correct_code_reward_func": 0.5833333432674408, "rewards/len_reward_func": 0.5, "step": 271 }, { "completion_length": 54.770835876464844, "epoch": 4.32, "grad_norm": 1.765789657496171, "kl": 0.4287109375, "learning_rate": 2.2655353244295927e-07, "loss": 0.0004, "reward": 0.7500000298023224, "reward_std": 0.08908708393573761, "rewards/correct_code_reward_func": 0.2500000074505806, "rewards/len_reward_func": 0.5, "step": 272 }, { "completion_length": 46.833335876464844, "epoch": 4.336, "grad_norm": 3.0926129932371356, "kl": 0.35546875, "learning_rate": 2.2494179552657974e-07, "loss": 0.0004, "reward": 0.9128472208976746, "reward_std": 0.2177412360906601, "rewards/correct_code_reward_func": 0.4166666716337204, "rewards/len_reward_func": 0.49618056416511536, "step": 273 }, { "completion_length": 29.479166984558105, "epoch": 4.352, "grad_norm": 9.296614387501771, "kl": 0.21484375, "learning_rate": 2.233311100022734e-07, "loss": 0.0002, "reward": 0.875, "reward_std": 0.31142252683639526, "rewards/correct_code_reward_func": 0.375, "rewards/len_reward_func": 0.5, "step": 274 }, { "completion_length": 24.104166984558105, "epoch": 4.368, "grad_norm": 4.386312904473957, "kl": 0.396484375, "learning_rate": 2.2172154345117894e-07, "loss": 0.0004, "reward": 0.7291666865348816, "reward_std": 0.22516433894634247, "rewards/correct_code_reward_func": 0.25, "rewards/len_reward_func": 0.4791666716337204, "step": 275 }, { "completion_length": 31.229167938232422, "epoch": 4.384, "grad_norm": 9.14988110970696, "kl": 1.267578125, "learning_rate": 2.2011316340748528e-07, "loss": 0.0013, "reward": 0.8958333730697632, "reward_std": 0.33108004927635193, "rewards/correct_code_reward_func": 0.3958333432674408, "rewards/len_reward_func": 0.5, "step": 276 }, { "completion_length": 47.041666984558105, "epoch": 4.4, "grad_norm": 15.08131618344892, "kl": 4.38720703125, "learning_rate": 2.1850603735559776e-07, "loss": 0.0044, "reward": 0.9583333432674408, "reward_std": 0.2903675250709057, "rewards/correct_code_reward_func": 0.4583333432674408, "rewards/len_reward_func": 0.5, "step": 277 }, { "completion_length": 49.541666984558105, "epoch": 4.416, "grad_norm": 4.095080167229849, "kl": 0.8115234375, "learning_rate": 2.1690023272730678e-07, "loss": 0.0008, "reward": 0.625, "reward_std": 0.1178511306643486, "rewards/correct_code_reward_func": 0.1458333432674408, "rewards/len_reward_func": 0.4791666716337204, "step": 278 }, { "completion_length": 25.291666984558105, "epoch": 4.432, "grad_norm": 7.954449742072288, "kl": 0.2734375, "learning_rate": 2.1529581689895836e-07, "loss": 0.0003, "reward": 0.9166666865348816, "reward_std": 0.36751919239759445, "rewards/correct_code_reward_func": 0.4166666716337204, "rewards/len_reward_func": 0.5, "step": 279 }, { "completion_length": 31.604166984558105, "epoch": 4.448, "grad_norm": 29.8321255682335, "kl": 21.2265625, "learning_rate": 2.1369285718862748e-07, "loss": 0.0212, "reward": 0.6666666865348816, "reward_std": 0.17251639068126678, "rewards/correct_code_reward_func": 0.1666666716337204, "rewards/len_reward_func": 0.5, "step": 280 }, { "completion_length": 19.08333396911621, "epoch": 4.464, "grad_norm": 5.637716121004335, "kl": 1.09765625, "learning_rate": 2.1209142085329298e-07, "loss": 0.0011, "reward": 1.1666666865348816, "reward_std": 0.1178511306643486, "rewards/correct_code_reward_func": 0.6666666865348816, "rewards/len_reward_func": 0.5, "step": 281 }, { "completion_length": 34.37500190734863, "epoch": 4.48, "grad_norm": 4861.585798607741, "kl": 3440.1103515625, "learning_rate": 2.104915750860164e-07, "loss": 3.4448, "reward": 0.871611475944519, "reward_std": 0.3660082519054413, "rewards/correct_code_reward_func": 0.3750000149011612, "rewards/len_reward_func": 0.49661144614219666, "step": 282 }, { "completion_length": 27.1875, "epoch": 4.496, "grad_norm": 2.5633170180059732, "kl": 0.3515625, "learning_rate": 2.088933870131218e-07, "loss": 0.0004, "reward": 0.75, "reward_std": 0.19500282034277916, "rewards/correct_code_reward_func": 0.25, "rewards/len_reward_func": 0.5, "step": 283 }, { "completion_length": 51.25000190734863, "epoch": 4.5120000000000005, "grad_norm": 2.416767668919635, "kl": 0.4931640625, "learning_rate": 2.072969236913799e-07, "loss": 0.0005, "reward": 0.8901910185813904, "reward_std": 0.1639716625213623, "rewards/correct_code_reward_func": 0.3958333432674408, "rewards/len_reward_func": 0.4943576455116272, "step": 284 }, { "completion_length": 19.625, "epoch": 4.5280000000000005, "grad_norm": 2.4304450279898893, "kl": 0.373046875, "learning_rate": 2.0570225210519433e-07, "loss": 0.0004, "reward": 0.791666716337204, "reward_std": 0.22233543917536736, "rewards/correct_code_reward_func": 0.29166667722165585, "rewards/len_reward_func": 0.5, "step": 285 }, { "completion_length": 30.08333396911621, "epoch": 4.5440000000000005, "grad_norm": 6.167179936081702, "kl": 0.3251953125, "learning_rate": 2.0410943916379097e-07, "loss": 0.0003, "reward": 0.9166666865348816, "reward_std": 0.2903675436973572, "rewards/correct_code_reward_func": 0.4166666865348816, "rewards/len_reward_func": 0.5, "step": 286 }, { "completion_length": 18.354167461395264, "epoch": 4.5600000000000005, "grad_norm": 4.914270308521702, "kl": 1.1083984375, "learning_rate": 2.0251855169841075e-07, "loss": 0.0011, "reward": 1.0833333730697632, "reward_std": 0.22233545035123825, "rewards/correct_code_reward_func": 0.5833333432674408, "rewards/len_reward_func": 0.5, "step": 287 }, { "completion_length": 73.60416984558105, "epoch": 4.576, "grad_norm": 3.3424488078879273, "kl": 0.28955078125, "learning_rate": 2.0092965645950564e-07, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.13607725501060486, "rewards/correct_code_reward_func": 0.1875, "rewards/len_reward_func": 0.5, "step": 288 }, { "completion_length": 39.22916793823242, "epoch": 4.592, "grad_norm": 6.32426908664414, "kl": 0.28271484375, "learning_rate": 1.993428201139375e-07, "loss": 0.0003, "reward": 0.6666666865348816, "reward_std": 0.2342708557844162, "rewards/correct_code_reward_func": 0.1666666716337204, "rewards/len_reward_func": 0.5, "step": 289 }, { "completion_length": 22.479166984558105, "epoch": 4.608, "grad_norm": 38.222771661469466, "kl": 16.890625, "learning_rate": 1.977581092421812e-07, "loss": 0.0169, "reward": 0.8958333730697632, "reward_std": 0.08625819534063339, "rewards/correct_code_reward_func": 0.3958333432674408, "rewards/len_reward_func": 0.5, "step": 290 }, { "completion_length": 34.979166984558105, "epoch": 4.624, "grad_norm": 8.242066863632235, "kl": 0.9091796875, "learning_rate": 1.9617559033553126e-07, "loss": 0.0009, "reward": 0.8333333730697632, "reward_std": 0.2342708334326744, "rewards/correct_code_reward_func": 0.3333333432674408, "rewards/len_reward_func": 0.5, "step": 291 }, { "completion_length": 24.541667938232422, "epoch": 4.64, "grad_norm": 1.21691590374382, "kl": 0.23046875, "learning_rate": 1.9459532979331148e-07, "loss": 0.0002, "reward": 1.0625000596046448, "reward_std": 0.13607725501060486, "rewards/correct_code_reward_func": 0.5625000149011612, "rewards/len_reward_func": 0.5, "step": 292 }, { "completion_length": 25.27083396911621, "epoch": 4.656, "grad_norm": 8.643768890047063, "kl": 0.443359375, "learning_rate": 1.930173939200892e-07, "loss": 0.0004, "reward": 1.0625000596046448, "reward_std": 0.2931964099407196, "rewards/correct_code_reward_func": 0.5625000298023224, "rewards/len_reward_func": 0.5, "step": 293 }, { "completion_length": 35.64583396911621, "epoch": 4.672, "grad_norm": 2.743333904000121, "kl": 1.708984375, "learning_rate": 1.9144184892289336e-07, "loss": 0.0017, "reward": 0.8333333730697632, "reward_std": 0.17817416787147522, "rewards/correct_code_reward_func": 0.3333333432674408, "rewards/len_reward_func": 0.5, "step": 294 }, { "completion_length": 33.812500953674316, "epoch": 4.688, "grad_norm": 6.045139670180444, "kl": 0.34765625, "learning_rate": 1.8986876090843664e-07, "loss": 0.0003, "reward": 1.0625, "reward_std": 0.1480126492679119, "rewards/correct_code_reward_func": 0.5833333432674408, "rewards/len_reward_func": 0.4791666716337204, "step": 295 }, { "completion_length": 61.95833492279053, "epoch": 4.704, "grad_norm": 3.6627974499384623, "kl": 0.453125, "learning_rate": 1.882981958803414e-07, "loss": 0.0005, "reward": 0.7690277099609375, "reward_std": 0.13556675985455513, "rewards/correct_code_reward_func": 0.2916666716337204, "rewards/len_reward_func": 0.4773610234260559, "step": 296 }, { "completion_length": 47.00000190734863, "epoch": 4.72, "grad_norm": 3.1790839878647574, "kl": 0.4951171875, "learning_rate": 1.8673021973637093e-07, "loss": 0.0005, "reward": 0.8541666865348816, "reward_std": 0.0589255653321743, "rewards/correct_code_reward_func": 0.3541666865348816, "rewards/len_reward_func": 0.5, "step": 297 }, { "completion_length": 34.85416793823242, "epoch": 4.736, "grad_norm": 1.8967683625125047, "kl": 0.4326171875, "learning_rate": 1.8516489826566374e-07, "loss": 0.0004, "reward": 1.1041666865348816, "reward_std": 0.23144195601344109, "rewards/correct_code_reward_func": 0.6041666865348816, "rewards/len_reward_func": 0.5, "step": 298 }, { "completion_length": 21.89583396911621, "epoch": 4.752, "grad_norm": 11.472906917031708, "kl": 0.3603515625, "learning_rate": 1.8360229714597368e-07, "loss": 0.0004, "reward": 1.2916666865348816, "reward_std": 0.22233543917536736, "rewards/correct_code_reward_func": 0.7916666865348816, "rewards/len_reward_func": 0.5, "step": 299 }, { "completion_length": 47.47916793823242, "epoch": 4.768, "grad_norm": 18.376424918444233, "kl": 4.580078125, "learning_rate": 1.8204248194091425e-07, "loss": 0.0045, "reward": 0.7916666865348816, "reward_std": 0.07715167850255966, "rewards/correct_code_reward_func": 0.2916666716337204, "rewards/len_reward_func": 0.5, "step": 300 }, { "completion_length": 56.5, "epoch": 4.784, "grad_norm": 7.120288162523622, "kl": 0.31298828125, "learning_rate": 1.804855180972075e-07, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.23144196718931198, "rewards/correct_code_reward_func": 0.4375, "rewards/len_reward_func": 0.5, "step": 301 }, { "completion_length": 42.89583396911621, "epoch": 4.8, "grad_norm": 4.028419691961419, "kl": 1.111328125, "learning_rate": 1.7893147094193784e-07, "loss": 0.0011, "reward": 0.5833333432674408, "reward_std": 0.08908708393573761, "rewards/correct_code_reward_func": 0.0833333358168602, "rewards/len_reward_func": 0.5, "step": 302 }, { "completion_length": 39.95833396911621, "epoch": 4.816, "grad_norm": 1.911137619069032, "kl": 0.3046875, "learning_rate": 1.7738040567981165e-07, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.37034808099269867, "rewards/correct_code_reward_func": 0.3333333358168602, "rewards/len_reward_func": 0.4791666716337204, "step": 303 }, { "completion_length": 31.854167938232422, "epoch": 4.832, "grad_norm": 2.705599015258731, "kl": 2.70703125, "learning_rate": 1.7583238739042084e-07, "loss": 0.0027, "reward": 0.6041666865348816, "reward_std": 0.204109326004982, "rewards/correct_code_reward_func": 0.1041666679084301, "rewards/len_reward_func": 0.5, "step": 304 }, { "completion_length": 66.0416669845581, "epoch": 4.848, "grad_norm": 6.944354666378553, "kl": 1.524169921875, "learning_rate": 1.7428748102551234e-07, "loss": 0.0015, "reward": 0.7218064665794373, "reward_std": 0.2561583071947098, "rewards/correct_code_reward_func": 0.2291666679084301, "rewards/len_reward_func": 0.49263978004455566, "step": 305 }, { "completion_length": 31.39583396911621, "epoch": 4.864, "grad_norm": 0.6849820957521907, "kl": 0.2724609375, "learning_rate": 1.7274575140626315e-07, "loss": 0.0003, "reward": 1.1458333730697632, "reward_std": 0.0589255653321743, "rewards/correct_code_reward_func": 0.6458333432674408, "rewards/len_reward_func": 0.5, "step": 306 }, { "completion_length": 19.14583396911621, "epoch": 4.88, "grad_norm": 21.688892126211925, "kl": 35.15625, "learning_rate": 1.712072632205604e-07, "loss": 0.0351, "reward": 1.2083333730697632, "reward_std": 0.34018659591674805, "rewards/correct_code_reward_func": 0.7083333730697632, "rewards/len_reward_func": 0.5, "step": 307 }, { "completion_length": 50.04166793823242, "epoch": 4.896, "grad_norm": 0.829370422590923, "kl": 0.26708984375, "learning_rate": 1.6967208102028696e-07, "loss": 0.0003, "reward": 0.9525146782398224, "reward_std": 0.17141204327344894, "rewards/correct_code_reward_func": 0.4583333358168602, "rewards/len_reward_func": 0.4941813200712204, "step": 308 }, { "completion_length": 22.791667938232422, "epoch": 4.912, "grad_norm": 2.275578259047212, "kl": 1.4638671875, "learning_rate": 1.6814026921861335e-07, "loss": 0.0015, "reward": 0.8750000298023224, "reward_std": 0.1451837606728077, "rewards/correct_code_reward_func": 0.3750000149011612, "rewards/len_reward_func": 0.5, "step": 309 }, { "completion_length": 31.604166984558105, "epoch": 4.928, "grad_norm": 6.885099376296929, "kl": 2.74658203125, "learning_rate": 1.6661189208729489e-07, "loss": 0.0027, "reward": 1.2083333432674408, "reward_std": 0.24966806173324585, "rewards/correct_code_reward_func": 0.7083333432674408, "rewards/len_reward_func": 0.5, "step": 310 }, { "completion_length": 29.666667938232422, "epoch": 4.944, "grad_norm": 6.096335936871119, "kl": 0.33837890625, "learning_rate": 1.6508701375397486e-07, "loss": 0.0003, "reward": 0.916666716337204, "reward_std": 0.45660628378391266, "rewards/correct_code_reward_func": 0.4166666865348816, "rewards/len_reward_func": 0.5, "step": 311 }, { "completion_length": 40.10416793823242, "epoch": 4.96, "grad_norm": 12.76661113809146, "kl": 0.421875, "learning_rate": 1.6356569819949427e-07, "loss": 0.0004, "reward": 1.0625000596046448, "reward_std": 0.30231601744890213, "rewards/correct_code_reward_func": 0.5625000298023224, "rewards/len_reward_func": 0.5, "step": 312 }, { "completion_length": 44.64583492279053, "epoch": 4.976, "grad_norm": 2.1774262299991034, "kl": 1.03515625, "learning_rate": 1.6204800925520685e-07, "loss": 0.001, "reward": 0.8125, "reward_std": 0.28126102685928345, "rewards/correct_code_reward_func": 0.3125, "rewards/len_reward_func": 0.5, "step": 313 }, { "completion_length": 32.479166984558105, "epoch": 4.992, "grad_norm": 1.1114158577515365, "kl": 0.85546875, "learning_rate": 1.6053401060030097e-07, "loss": 0.0009, "reward": 0.854166716337204, "reward_std": 0.175345279276371, "rewards/correct_code_reward_func": 0.354166679084301, "rewards/len_reward_func": 0.5, "step": 314 }, { "completion_length": 23.95833396911621, "epoch": 5.0, "grad_norm": 1.1114158577515365, "kl": 0.486328125, "learning_rate": 1.5902376575912814e-07, "loss": 0.0002, "reward": 1.375, "reward_std": 0.17251639068126678, "rewards/correct_code_reward_func": 0.875, "rewards/len_reward_func": 0.5, "step": 315 }, { "completion_length": 27.729167938232422, "epoch": 5.016, "grad_norm": 3.4863048431802666, "kl": 0.4541015625, "learning_rate": 1.57517338098537e-07, "loss": 0.0005, "reward": 0.9583333730697632, "reward_std": 0.22233543917536736, "rewards/correct_code_reward_func": 0.458333358168602, "rewards/len_reward_func": 0.5, "step": 316 }, { "completion_length": 45.79166793823242, "epoch": 5.032, "grad_norm": 2.9450234856850557, "kl": 0.48193359375, "learning_rate": 1.5601479082521525e-07, "loss": 0.0005, "reward": 0.8541666865348816, "reward_std": 0.25392838567495346, "rewards/correct_code_reward_func": 0.3541666865348816, "rewards/len_reward_func": 0.5, "step": 317 }, { "completion_length": 37.645835876464844, "epoch": 5.048, "grad_norm": 2.365992710448499, "kl": 0.2646484375, "learning_rate": 1.545161869830371e-07, "loss": 0.0003, "reward": 0.9583333730697632, "reward_std": 0.2840898931026459, "rewards/correct_code_reward_func": 0.4583333432674408, "rewards/len_reward_func": 0.5, "step": 318 }, { "completion_length": 56.79166793823242, "epoch": 5.064, "grad_norm": 1.7294710190023255, "kl": 0.18359375, "learning_rate": 1.5302158945041837e-07, "loss": 0.0002, "reward": 0.7899168729782104, "reward_std": 0.22728458046913147, "rewards/correct_code_reward_func": 0.2916666679084301, "rewards/len_reward_func": 0.49825021624565125, "step": 319 }, { "completion_length": 20.89583396911621, "epoch": 5.08, "grad_norm": 16.06787787034615, "kl": 0.431640625, "learning_rate": 1.5153106093767825e-07, "loss": 0.0004, "reward": 0.6458333432674408, "reward_std": 0.0589255653321743, "rewards/correct_code_reward_func": 0.1458333432674408, "rewards/len_reward_func": 0.5, "step": 320 }, { "completion_length": 37.72916793823242, "epoch": 5.096, "grad_norm": 4.822046610306533, "kl": 0.375, "learning_rate": 1.5004466398440773e-07, "loss": 0.0004, "reward": 0.916666716337204, "reward_std": 0.19500279426574707, "rewards/correct_code_reward_func": 0.4375000149011612, "rewards/len_reward_func": 0.4791666716337204, "step": 321 }, { "completion_length": 67.45833587646484, "epoch": 5.112, "grad_norm": 4.887246928042618, "kl": 0.29296875, "learning_rate": 1.4856246095684622e-07, "loss": 0.0003, "reward": 0.8657760918140411, "reward_std": 0.26504893600940704, "rewards/correct_code_reward_func": 0.3958333432674408, "rewards/len_reward_func": 0.46994274854660034, "step": 322 }, { "completion_length": 49.12500190734863, "epoch": 5.128, "grad_norm": 2.807742783582148, "kl": 0.2431640625, "learning_rate": 1.4708451404526407e-07, "loss": 0.0002, "reward": 1.0833333730697632, "reward_std": 0.19500282034277916, "rewards/correct_code_reward_func": 0.583333358168602, "rewards/len_reward_func": 0.5, "step": 323 }, { "completion_length": 19.916667938232422, "epoch": 5.144, "grad_norm": 6.084536093056242, "kl": 1.8671875, "learning_rate": 1.4561088526135374e-07, "loss": 0.0019, "reward": 0.75, "reward_std": 0.1451837718486786, "rewards/correct_code_reward_func": 0.25, "rewards/len_reward_func": 0.5, "step": 324 }, { "completion_length": 54.52083396911621, "epoch": 5.16, "grad_norm": 5.422926804352802, "kl": 4.15625, "learning_rate": 1.4414163643562753e-07, "loss": 0.0042, "reward": 0.5833333432674408, "reward_std": 0.1451837606728077, "rewards/correct_code_reward_func": 0.08333333395421505, "rewards/len_reward_func": 0.5, "step": 325 }, { "completion_length": 33.04166793823242, "epoch": 5.176, "grad_norm": 5.605984792393882, "kl": 0.18505859375, "learning_rate": 1.4267682921482356e-07, "loss": 0.0002, "reward": 1.0625000596046448, "reward_std": 0.2658637687563896, "rewards/correct_code_reward_func": 0.5625000149011612, "rewards/len_reward_func": 0.5, "step": 326 }, { "completion_length": 34.56250190734863, "epoch": 5.192, "grad_norm": 11.103407207018604, "kl": 0.2412109375, "learning_rate": 1.4121652505931918e-07, "loss": 0.0002, "reward": 0.8541666865348816, "reward_std": 0.33592626452445984, "rewards/correct_code_reward_func": 0.3541666716337204, "rewards/len_reward_func": 0.5, "step": 327 }, { "completion_length": 34.89583492279053, "epoch": 5.208, "grad_norm": 5.289761368392477, "kl": 1.3271484375, "learning_rate": 1.3976078524055203e-07, "loss": 0.0013, "reward": 0.9074198007583618, "reward_std": 0.31024400889873505, "rewards/correct_code_reward_func": 0.4166666716337204, "rewards/len_reward_func": 0.4907531142234802, "step": 328 }, { "completion_length": 42.91666793823242, "epoch": 5.224, "grad_norm": 4.462596101311776, "kl": 1.146484375, "learning_rate": 1.383096708384494e-07, "loss": 0.0011, "reward": 0.6458333432674408, "reward_std": 0.1753452718257904, "rewards/correct_code_reward_func": 0.1458333432674408, "rewards/len_reward_func": 0.5, "step": 329 }, { "completion_length": 35.52083492279053, "epoch": 5.24, "grad_norm": 1.0563895878531222, "kl": 0.5283203125, "learning_rate": 1.3686324273886528e-07, "loss": 0.0005, "reward": 0.9791666865348816, "reward_std": 0.175345279276371, "rewards/correct_code_reward_func": 0.4791666716337204, "rewards/len_reward_func": 0.5, "step": 330 }, { "completion_length": 37.64583396911621, "epoch": 5.256, "grad_norm": 5.06525759682497, "kl": 0.2841796875, "learning_rate": 1.354215616310258e-07, "loss": 0.0003, "reward": 1.1250000596046448, "reward_std": 0.19500280916690826, "rewards/correct_code_reward_func": 0.6250000298023224, "rewards/len_reward_func": 0.5, "step": 331 }, { "completion_length": 38.58333396911621, "epoch": 5.272, "grad_norm": 9.911669806473292, "kl": 0.388671875, "learning_rate": 1.339846880049829e-07, "loss": 0.0004, "reward": 0.9583333730697632, "reward_std": 0.2357022613286972, "rewards/correct_code_reward_func": 0.458333358168602, "rewards/len_reward_func": 0.5, "step": 332 }, { "completion_length": 28.437501907348633, "epoch": 5.288, "grad_norm": 3.57323242715857, "kl": 1.20703125, "learning_rate": 1.325526821490761e-07, "loss": 0.0012, "reward": 0.8958333432674408, "reward_std": 0.2041093371808529, "rewards/correct_code_reward_func": 0.3958333432674408, "rewards/len_reward_func": 0.5, "step": 333 }, { "completion_length": 49.60416793823242, "epoch": 5.304, "grad_norm": 6.351617997400021, "kl": 0.169921875, "learning_rate": 1.3112560414740313e-07, "loss": 0.0002, "reward": 0.6666666865348816, "reward_std": 0.24339044094085693, "rewards/correct_code_reward_func": 0.1666666716337204, "rewards/len_reward_func": 0.5, "step": 334 }, { "completion_length": 27.89583396911621, "epoch": 5.32, "grad_norm": 1.227821200402909, "kl": 0.3876953125, "learning_rate": 1.2970351387729872e-07, "loss": 0.0004, "reward": 1.1250000596046448, "reward_std": 0.07715167850255966, "rewards/correct_code_reward_func": 0.6250000149011612, "rewards/len_reward_func": 0.5, "step": 335 }, { "completion_length": 35.479166984558105, "epoch": 5.336, "grad_norm": 6.4331146800412515, "kl": 0.58984375, "learning_rate": 1.2828647100682261e-07, "loss": 0.0006, "reward": 0.8541666865348816, "reward_std": 0.13607725501060486, "rewards/correct_code_reward_func": 0.3541666716337204, "rewards/len_reward_func": 0.5, "step": 336 }, { "completion_length": 36.41666793823242, "epoch": 5.352, "grad_norm": 403.82773226092394, "kl": 451.5, "learning_rate": 1.2687453499225546e-07, "loss": 0.4527, "reward": 0.9583333432674408, "reward_std": 0.19500282034277916, "rewards/correct_code_reward_func": 0.4791666865348816, "rewards/len_reward_func": 0.4791666716337204, "step": 337 }, { "completion_length": 41.0, "epoch": 5.368, "grad_norm": 108.94594158857709, "kl": 0.30615234375, "learning_rate": 1.2546776507560467e-07, "loss": 0.0003, "reward": 1.0833333730697632, "reward_std": 0.376638799905777, "rewards/correct_code_reward_func": 0.5833333432674408, "rewards/len_reward_func": 0.5, "step": 338 }, { "completion_length": 43.87500190734863, "epoch": 5.384, "grad_norm": 5.008994119057386, "kl": 0.23828125, "learning_rate": 1.2406622028211843e-07, "loss": 0.0002, "reward": 0.8333333432674408, "reward_std": 0.1178511306643486, "rewards/correct_code_reward_func": 0.3541666716337204, "rewards/len_reward_func": 0.4791666716337204, "step": 339 }, { "completion_length": 27.58333396911621, "epoch": 5.4, "grad_norm": 5.089868304364812, "kl": 0.3525390625, "learning_rate": 1.2266995941780933e-07, "loss": 0.0004, "reward": 0.8333333432674408, "reward_std": 0.2342708557844162, "rewards/correct_code_reward_func": 0.3333333358168602, "rewards/len_reward_func": 0.5, "step": 340 }, { "completion_length": 33.333335876464844, "epoch": 5.416, "grad_norm": 5.28992518624715, "kl": 0.41015625, "learning_rate": 1.2127904106698665e-07, "loss": 0.0004, "reward": 1.1875000596046448, "reward_std": 0.3310800790786743, "rewards/correct_code_reward_func": 0.6875000298023224, "rewards/len_reward_func": 0.5, "step": 341 }, { "completion_length": 53.68750286102295, "epoch": 5.432, "grad_norm": 6.798840307980019, "kl": 0.478515625, "learning_rate": 1.1989352358979888e-07, "loss": 0.0005, "reward": 0.8114994466304779, "reward_std": 0.39734339714050293, "rewards/correct_code_reward_func": 0.3333333432674408, "rewards/len_reward_func": 0.4781661033630371, "step": 342 }, { "completion_length": 46.29166793823242, "epoch": 5.448, "grad_norm": 2.7139372433900153, "kl": 0.326171875, "learning_rate": 1.1851346511978424e-07, "loss": 0.0003, "reward": 0.8063492178916931, "reward_std": 0.15347431600093842, "rewards/correct_code_reward_func": 0.3125000149011612, "rewards/len_reward_func": 0.4938492029905319, "step": 343 }, { "completion_length": 32.937500953674316, "epoch": 5.464, "grad_norm": 12.07849541059564, "kl": 0.2080078125, "learning_rate": 1.1713892356143238e-07, "loss": 0.0002, "reward": 0.8125000298023224, "reward_std": 0.22516433894634247, "rewards/correct_code_reward_func": 0.3125000074505806, "rewards/len_reward_func": 0.5, "step": 344 }, { "completion_length": 25.104166984558105, "epoch": 5.48, "grad_norm": 19.666094571207797, "kl": 0.6865234375, "learning_rate": 1.1576995658775404e-07, "loss": 0.0007, "reward": 0.916666716337204, "reward_std": 0.19500282034277916, "rewards/correct_code_reward_func": 0.4166666865348816, "rewards/len_reward_func": 0.5, "step": 345 }, { "completion_length": 22.437500953674316, "epoch": 5.496, "grad_norm": 7.745046511847673, "kl": 4.3095703125, "learning_rate": 1.1440662163786166e-07, "loss": 0.0043, "reward": 1.2083333730697632, "reward_std": 0.2903675250709057, "rewards/correct_code_reward_func": 0.708333358168602, "rewards/len_reward_func": 0.5, "step": 346 }, { "completion_length": 68.66666793823242, "epoch": 5.5120000000000005, "grad_norm": 1.9069458533343282, "kl": 0.35986328125, "learning_rate": 1.1304897591455928e-07, "loss": 0.0004, "reward": 0.8611658215522766, "reward_std": 0.3323476314544678, "rewards/correct_code_reward_func": 0.375, "rewards/len_reward_func": 0.4861658066511154, "step": 347 }, { "completion_length": 56.56250190734863, "epoch": 5.5280000000000005, "grad_norm": 11.309145499622288, "kl": 2.7958984375, "learning_rate": 1.1169707638194237e-07, "loss": 0.0028, "reward": 0.9375000596046448, "reward_std": 0.24056155234575272, "rewards/correct_code_reward_func": 0.4375000149011612, "rewards/len_reward_func": 0.5, "step": 348 }, { "completion_length": 19.687500953674316, "epoch": 5.5440000000000005, "grad_norm": 4.48965441839464, "kl": 0.3076171875, "learning_rate": 1.103509797630077e-07, "loss": 0.0003, "reward": 1.1041667461395264, "reward_std": 0.08625819534063339, "rewards/correct_code_reward_func": 0.6041666865348816, "rewards/len_reward_func": 0.5, "step": 349 }, { "completion_length": 48.33333396911621, "epoch": 5.5600000000000005, "grad_norm": 0.03083945474392255, "kl": 24.33740234375, "learning_rate": 1.0901074253727336e-07, "loss": 0.0243, "reward": 0.7916666865348816, "reward_std": 0.24966806173324585, "rewards/correct_code_reward_func": 0.2916666865348816, "rewards/len_reward_func": 0.5, "step": 350 }, { "completion_length": 32.27083492279053, "epoch": 5.576, "grad_norm": 19.443719364276035, "kl": 0.41796875, "learning_rate": 1.0767642093840932e-07, "loss": 0.0004, "reward": 1.0, "reward_std": 0.1178511306643486, "rewards/correct_code_reward_func": 0.520833333954215, "rewards/len_reward_func": 0.4791666716337204, "step": 351 }, { "completion_length": 55.39583396911621, "epoch": 5.592, "grad_norm": 3.0676472737509886, "kl": 0.328125, "learning_rate": 1.0634807095187737e-07, "loss": 0.0003, "reward": 0.7708333432674408, "reward_std": 0.25392837077379227, "rewards/correct_code_reward_func": 0.2708333432674408, "rewards/len_reward_func": 0.5, "step": 352 }, { "completion_length": 26.125000953674316, "epoch": 5.608, "grad_norm": 2.15117359028803, "kl": 0.3544921875, "learning_rate": 1.0502574831258257e-07, "loss": 0.0004, "reward": 0.8958333730697632, "reward_std": 0.23144196718931198, "rewards/correct_code_reward_func": 0.395833358168602, "rewards/len_reward_func": 0.5, "step": 353 }, { "completion_length": 26.729167938232422, "epoch": 5.624, "grad_norm": 2.1508382760067555, "kl": 0.99462890625, "learning_rate": 1.0370950850253449e-07, "loss": 0.001, "reward": 1.1666666865348816, "reward_std": 0.1178511306643486, "rewards/correct_code_reward_func": 0.6666666865348816, "rewards/len_reward_func": 0.5, "step": 354 }, { "completion_length": 25.166666984558105, "epoch": 5.64, "grad_norm": 1.609591950875138, "kl": 0.26611328125, "learning_rate": 1.0239940674851941e-07, "loss": 0.0003, "reward": 1.041666716337204, "reward_std": 0.24966806918382645, "rewards/correct_code_reward_func": 0.5416666865348816, "rewards/len_reward_func": 0.5, "step": 355 }, { "completion_length": 72.22916793823242, "epoch": 5.656, "grad_norm": 2.9309105849693218, "kl": 0.1826171875, "learning_rate": 1.0109549801978304e-07, "loss": 0.0002, "reward": 0.625, "reward_std": 0.16623875498771667, "rewards/correct_code_reward_func": 0.125, "rewards/len_reward_func": 0.5, "step": 356 }, { "completion_length": 45.062500953674316, "epoch": 5.672, "grad_norm": 4.139961399350185, "kl": 0.486328125, "learning_rate": 9.979783702572411e-08, "loss": 0.0005, "reward": 1.1041666865348816, "reward_std": 0.5021650195121765, "rewards/correct_code_reward_func": 0.6041666865348816, "rewards/len_reward_func": 0.5, "step": 357 }, { "completion_length": 51.70833396911621, "epoch": 5.688, "grad_norm": 3.6459872360115355, "kl": 0.19677734375, "learning_rate": 9.850647821359917e-08, "loss": 0.0002, "reward": 0.875, "reward_std": 0.31142252683639526, "rewards/correct_code_reward_func": 0.3958333358168602, "rewards/len_reward_func": 0.4791666716337204, "step": 358 }, { "completion_length": 23.02083396911621, "epoch": 5.704, "grad_norm": 8.882333735346968, "kl": 0.27685546875, "learning_rate": 9.722147576623744e-08, "loss": 0.0003, "reward": 1.2291666865348816, "reward_std": 0.1480126492679119, "rewards/correct_code_reward_func": 0.7291666865348816, "rewards/len_reward_func": 0.5, "step": 359 }, { "completion_length": 22.500000953674316, "epoch": 5.72, "grad_norm": 1.769657123945252, "kl": 0.3662109375, "learning_rate": 9.594288359976815e-08, "loss": 0.0004, "reward": 1.1875, "reward_std": 0.23144196718931198, "rewards/correct_code_reward_func": 0.7083333432674408, "rewards/len_reward_func": 0.4791666716337204, "step": 360 }, { "completion_length": 31.979167938232422, "epoch": 5.736, "grad_norm": 1.341039622679155, "kl": 0.4580078125, "learning_rate": 9.467075536135785e-08, "loss": 0.0005, "reward": 0.6041666865348816, "reward_std": 0.14801263809204102, "rewards/correct_code_reward_func": 0.1041666716337204, "rewards/len_reward_func": 0.5, "step": 361 }, { "completion_length": 61.875, "epoch": 5.752, "grad_norm": 6.078841666686185, "kl": 2.7958984375, "learning_rate": 9.340514442695952e-08, "loss": 0.0028, "reward": 0.7839381992816925, "reward_std": 0.23519539088010788, "rewards/correct_code_reward_func": 0.3125000074505806, "rewards/len_reward_func": 0.4714381694793701, "step": 362 }, { "completion_length": 29.25, "epoch": 5.768, "grad_norm": 0.5338505194219121, "kl": 0.32421875, "learning_rate": 9.214610389907326e-08, "loss": 0.0003, "reward": 0.7083333730697632, "reward_std": 0.07715167850255966, "rewards/correct_code_reward_func": 0.2083333395421505, "rewards/len_reward_func": 0.5, "step": 363 }, { "completion_length": 34.52083396911621, "epoch": 5.784, "grad_norm": 1.9529867884255245, "kl": 0.2939453125, "learning_rate": 9.089368660451798e-08, "loss": 0.0003, "reward": 0.8541666865348816, "reward_std": 0.23144195601344109, "rewards/correct_code_reward_func": 0.3541666716337204, "rewards/len_reward_func": 0.5, "step": 364 }, { "completion_length": 36.06250190734863, "epoch": 5.8, "grad_norm": 4.331896756750715, "kl": 0.27197265625, "learning_rate": 8.964794509221507e-08, "loss": 0.0003, "reward": 1.1458333730697632, "reward_std": 0.175345279276371, "rewards/correct_code_reward_func": 0.6458333432674408, "rewards/len_reward_func": 0.5, "step": 365 }, { "completion_length": 30.479166984558105, "epoch": 5.816, "grad_norm": 1.170767590853997, "kl": 5.4716796875, "learning_rate": 8.840893163098332e-08, "loss": 0.0055, "reward": 0.8333333432674408, "reward_std": 0.2342708557844162, "rewards/correct_code_reward_func": 0.3333333358168602, "rewards/len_reward_func": 0.5, "step": 366 }, { "completion_length": 36.854166984558105, "epoch": 5.832, "grad_norm": 4.742230256398718, "kl": 0.53125, "learning_rate": 8.717669820734619e-08, "loss": 0.0005, "reward": 0.8750000298023224, "reward_std": 0.22233545035123825, "rewards/correct_code_reward_func": 0.3750000149011612, "rewards/len_reward_func": 0.5, "step": 367 }, { "completion_length": 29.041667938232422, "epoch": 5.848, "grad_norm": 3.0949748155814967, "kl": 0.8212890625, "learning_rate": 8.595129652335017e-08, "loss": 0.0008, "reward": 0.8174912929534912, "reward_std": 0.30320997536182404, "rewards/correct_code_reward_func": 0.3333333432674408, "rewards/len_reward_func": 0.484157919883728, "step": 368 }, { "completion_length": 42.04166793823242, "epoch": 5.864, "grad_norm": 2.5673081335228805, "kl": 0.25244140625, "learning_rate": 8.473277799439568e-08, "loss": 0.0003, "reward": 0.7916666865348816, "reward_std": 0.1451837606728077, "rewards/correct_code_reward_func": 0.2916666716337204, "rewards/len_reward_func": 0.5, "step": 369 }, { "completion_length": 21.416666984558105, "epoch": 5.88, "grad_norm": 8.714583282604275, "kl": 0.5546875, "learning_rate": 8.352119374707977e-08, "loss": 0.0006, "reward": 1.2916666865348816, "reward_std": 0.1178511306643486, "rewards/correct_code_reward_func": 0.7916666865348816, "rewards/len_reward_func": 0.5, "step": 370 }, { "completion_length": 45.1875, "epoch": 5.896, "grad_norm": 3.7959594929992884, "kl": 0.26953125, "learning_rate": 8.23165946170509e-08, "loss": 0.0003, "reward": 0.875, "reward_std": 0.1451837606728077, "rewards/correct_code_reward_func": 0.375, "rewards/len_reward_func": 0.5, "step": 371 }, { "completion_length": 33.22916793823242, "epoch": 5.912, "grad_norm": 2.6612237762750697, "kl": 0.349609375, "learning_rate": 8.11190311468759e-08, "loss": 0.0004, "reward": 1.1666666865348816, "reward_std": 0.2342708334326744, "rewards/correct_code_reward_func": 0.6666666716337204, "rewards/len_reward_func": 0.5, "step": 372 }, { "completion_length": 36.47916793823242, "epoch": 5.928, "grad_norm": 1.6123592594099534, "kl": 0.23291015625, "learning_rate": 7.992855358391967e-08, "loss": 0.0002, "reward": 0.8750000298023224, "reward_std": 0.20693820342421532, "rewards/correct_code_reward_func": 0.3750000149011612, "rewards/len_reward_func": 0.5, "step": 373 }, { "completion_length": 44.54166793823242, "epoch": 5.944, "grad_norm": 7.71060276672806, "kl": 0.27490234375, "learning_rate": 7.87452118782363e-08, "loss": 0.0003, "reward": 0.791666716337204, "reward_std": 0.1451837606728077, "rewards/correct_code_reward_func": 0.29166667722165585, "rewards/len_reward_func": 0.5, "step": 374 }, { "completion_length": 29.4375, "epoch": 5.96, "grad_norm": 2.403326466418192, "kl": 0.6865234375, "learning_rate": 7.756905568047392e-08, "loss": 0.0007, "reward": 0.854166716337204, "reward_std": 0.25392838940024376, "rewards/correct_code_reward_func": 0.35416667722165585, "rewards/len_reward_func": 0.5, "step": 375 }, { "completion_length": 22.916667938232422, "epoch": 5.976, "grad_norm": 1.6436335508907451, "kl": 1.37890625, "learning_rate": 7.640013433979093e-08, "loss": 0.0014, "reward": 0.9583333730697632, "reward_std": 0.1178511306643486, "rewards/correct_code_reward_func": 0.458333358168602, "rewards/len_reward_func": 0.5, "step": 376 }, { "completion_length": 34.08333396911621, "epoch": 5.992, "grad_norm": 5.19625049583634, "kl": 0.3408203125, "learning_rate": 7.523849690178566e-08, "loss": 0.0003, "reward": 0.8958333730697632, "reward_std": 0.08625819534063339, "rewards/correct_code_reward_func": 0.3958333432674408, "rewards/len_reward_func": 0.5, "step": 377 }, { "completion_length": 18.916667938232422, "epoch": 6.0, "grad_norm": 0.03498244958553028, "kl": 0.35546875, "learning_rate": 7.408419210643846e-08, "loss": 0.0002, "reward": 1.1666667461395264, "reward_std": 0.0, "rewards/correct_code_reward_func": 0.6666666865348816, "rewards/len_reward_func": 0.5, "step": 378 }, { "completion_length": 37.250000953674316, "epoch": 6.016, "grad_norm": 5.584972835115381, "kl": 2.3876953125, "learning_rate": 7.293726838606673e-08, "loss": 0.0024, "reward": 0.8750000596046448, "reward_std": 0.2994871214032173, "rewards/correct_code_reward_func": 0.3750000149011612, "rewards/len_reward_func": 0.5, "step": 379 }, { "completion_length": 32.1875, "epoch": 6.032, "grad_norm": 2.6356609513901277, "kl": 0.33203125, "learning_rate": 7.179777386329275e-08, "loss": 0.0003, "reward": 1.1250000596046448, "reward_std": 0.07715167850255966, "rewards/correct_code_reward_func": 0.6250000298023224, "rewards/len_reward_func": 0.5, "step": 380 }, { "completion_length": 32.60416793823242, "epoch": 6.048, "grad_norm": 1.625137414185801, "kl": 0.345703125, "learning_rate": 7.066575634902435e-08, "loss": 0.0003, "reward": 0.6666666865348816, "reward_std": 0.17251639068126678, "rewards/correct_code_reward_func": 0.1666666716337204, "rewards/len_reward_func": 0.5, "step": 381 }, { "completion_length": 45.770835876464844, "epoch": 6.064, "grad_norm": 1.869234906894, "kl": 0.248046875, "learning_rate": 6.954126334044949e-08, "loss": 0.0002, "reward": 0.9583333432674408, "reward_std": 0.320542111992836, "rewards/correct_code_reward_func": 0.4583333432674408, "rewards/len_reward_func": 0.5, "step": 382 }, { "completion_length": 32.89583396911621, "epoch": 6.08, "grad_norm": 4.898274958092342, "kl": 0.26123046875, "learning_rate": 6.842434201904255e-08, "loss": 0.0003, "reward": 0.9791666865348816, "reward_std": 0.4130779355764389, "rewards/correct_code_reward_func": 0.4791666716337204, "rewards/len_reward_func": 0.5, "step": 383 }, { "completion_length": 39.60416793823242, "epoch": 6.096, "grad_norm": 2.7651784941334285, "kl": 0.3056640625, "learning_rate": 6.731503924858516e-08, "loss": 0.0003, "reward": 1.0833333730697632, "reward_std": 0.20693820342421532, "rewards/correct_code_reward_func": 0.583333358168602, "rewards/len_reward_func": 0.5, "step": 384 }, { "completion_length": 44.93750190734863, "epoch": 6.112, "grad_norm": 4.863219550347464, "kl": 6.7158203125, "learning_rate": 6.621340157319996e-08, "loss": 0.0067, "reward": 0.6875000298023224, "reward_std": 0.0589255653321743, "rewards/correct_code_reward_func": 0.18750000558793545, "rewards/len_reward_func": 0.5, "step": 385 }, { "completion_length": 42.58333396911621, "epoch": 6.128, "grad_norm": 2.178001546252181, "kl": 0.287109375, "learning_rate": 6.511947521539737e-08, "loss": 0.0003, "reward": 0.8125000298023224, "reward_std": 0.3142513930797577, "rewards/correct_code_reward_func": 0.3125000149011612, "rewards/len_reward_func": 0.5, "step": 386 }, { "completion_length": 22.979166984558105, "epoch": 6.144, "grad_norm": 1.6068071891620737, "kl": 0.37109375, "learning_rate": 6.403330607413643e-08, "loss": 0.0004, "reward": 0.9583333432674408, "reward_std": 0.28408990800380707, "rewards/correct_code_reward_func": 0.4583333432674408, "rewards/len_reward_func": 0.5, "step": 387 }, { "completion_length": 23.604166984558105, "epoch": 6.16, "grad_norm": 3.9928344479996607, "kl": 0.3447265625, "learning_rate": 6.295493972289903e-08, "loss": 0.0003, "reward": 1.0000000596046448, "reward_std": 0.1178511306643486, "rewards/correct_code_reward_func": 0.5000000149011612, "rewards/len_reward_func": 0.5, "step": 388 }, { "completion_length": 33.89583396911621, "epoch": 6.176, "grad_norm": 2.632692948523292, "kl": 0.3388671875, "learning_rate": 6.188442140777742e-08, "loss": 0.0003, "reward": 0.6666666865348816, "reward_std": 0.15430335700511932, "rewards/correct_code_reward_func": 0.1666666679084301, "rewards/len_reward_func": 0.5, "step": 389 }, { "completion_length": 28.166667938232422, "epoch": 6.192, "grad_norm": 8.350787780570107, "kl": 5.517578125, "learning_rate": 6.082179604557616e-08, "loss": 0.0055, "reward": 1.0416666865348816, "reward_std": 0.1451837718486786, "rewards/correct_code_reward_func": 0.5416666716337204, "rewards/len_reward_func": 0.5, "step": 390 }, { "completion_length": 19.979166984558105, "epoch": 6.208, "grad_norm": 21.31957241329922, "kl": 0.837890625, "learning_rate": 5.976710822192721e-08, "loss": 0.0008, "reward": 1.1666666865348816, "reward_std": 0.22233545035123825, "rewards/correct_code_reward_func": 0.6666666716337204, "rewards/len_reward_func": 0.5, "step": 391 }, { "completion_length": 46.25000190734863, "epoch": 6.224, "grad_norm": 2.4315634778894886, "kl": 0.30615234375, "learning_rate": 5.8720402189419286e-08, "loss": 0.0003, "reward": 0.6666666865348816, "reward_std": 0.2342708334326744, "rewards/correct_code_reward_func": 0.1875000074505806, "rewards/len_reward_func": 0.4791666716337204, "step": 392 }, { "completion_length": 79.85416793823242, "epoch": 6.24, "grad_norm": 2.8323358281325715, "kl": 3.1953125, "learning_rate": 5.768172186574122e-08, "loss": 0.0032, "reward": 0.7708333432674408, "reward_std": 0.28126102685928345, "rewards/correct_code_reward_func": 0.2916666716337204, "rewards/len_reward_func": 0.4791666716337204, "step": 393 }, { "completion_length": 26.14583396911621, "epoch": 6.256, "grad_norm": 6.919623616583594, "kl": 1.27734375, "learning_rate": 5.6651110831839046e-08, "loss": 0.0013, "reward": 1.2291667461395264, "reward_std": 0.28126100450754166, "rewards/correct_code_reward_func": 0.7291666865348816, "rewards/len_reward_func": 0.5, "step": 394 }, { "completion_length": 24.5, "epoch": 6.272, "grad_norm": 30.75145450155544, "kl": 1.65966796875, "learning_rate": 5.5628612330087724e-08, "loss": 0.0017, "reward": 1.1250000298023224, "reward_std": 0.19500282034277916, "rewards/correct_code_reward_func": 0.6250000298023224, "rewards/len_reward_func": 0.5, "step": 395 }, { "completion_length": 44.16666793823242, "epoch": 6.288, "grad_norm": 2.646012784054006, "kl": 0.2724609375, "learning_rate": 5.461426926247639e-08, "loss": 0.0003, "reward": 0.8958333730697632, "reward_std": 0.22516431659460068, "rewards/correct_code_reward_func": 0.395833358168602, "rewards/len_reward_func": 0.5, "step": 396 }, { "completion_length": 18.416667938232422, "epoch": 6.304, "grad_norm": 1.4581978595368466, "kl": 0.4716796875, "learning_rate": 5.360812418880883e-08, "loss": 0.0005, "reward": 1.1041666865348816, "reward_std": 0.2041093371808529, "rewards/correct_code_reward_func": 0.6041666865348816, "rewards/len_reward_func": 0.5, "step": 397 }, { "completion_length": 48.479169845581055, "epoch": 6.32, "grad_norm": 152.66305944633206, "kl": 58.0390625, "learning_rate": 5.261021932491713e-08, "loss": 0.0578, "reward": 0.9166666865348816, "reward_std": 0.30860670655965805, "rewards/correct_code_reward_func": 0.4166666865348816, "rewards/len_reward_func": 0.5, "step": 398 }, { "completion_length": 32.729166984558105, "epoch": 6.336, "grad_norm": 7.835361687841303, "kl": 0.435546875, "learning_rate": 5.162059654089082e-08, "loss": 0.0004, "reward": 0.7291666865348816, "reward_std": 0.08625819534063339, "rewards/correct_code_reward_func": 0.2291666716337204, "rewards/len_reward_func": 0.5, "step": 399 }, { "completion_length": 71.20833587646484, "epoch": 6.352, "grad_norm": 1.2458790767126353, "kl": 2.98828125, "learning_rate": 5.0639297359319846e-08, "loss": 0.003, "reward": 0.8958333730697632, "reward_std": 0.30859363824129105, "rewards/correct_code_reward_func": 0.3958333432674408, "rewards/len_reward_func": 0.5, "step": 400 }, { "completion_length": 35.10416793823242, "epoch": 6.368, "grad_norm": 35.48307958654518, "kl": 22.49560546875, "learning_rate": 4.9666362953552534e-08, "loss": 0.0224, "reward": 0.8958333432674408, "reward_std": 0.28126100823283195, "rewards/correct_code_reward_func": 0.39583333395421505, "rewards/len_reward_func": 0.5, "step": 401 }, { "completion_length": 33.83333396911621, "epoch": 6.384, "grad_norm": 1.9125737436183505, "kl": 0.8876953125, "learning_rate": 4.870183414596793e-08, "loss": 0.0009, "reward": 1.0000000298023224, "reward_std": 0.2342708557844162, "rewards/correct_code_reward_func": 0.5000000223517418, "rewards/len_reward_func": 0.5, "step": 402 }, { "completion_length": 28.041667938232422, "epoch": 6.4, "grad_norm": 1.72406073879926, "kl": 0.3076171875, "learning_rate": 4.774575140626316e-08, "loss": 0.0003, "reward": 1.2708333730697632, "reward_std": 0.16340987384319305, "rewards/correct_code_reward_func": 0.7708333432674408, "rewards/len_reward_func": 0.5, "step": 403 }, { "completion_length": 25.5625, "epoch": 6.416, "grad_norm": 1.7825264711628441, "kl": 0.2783203125, "learning_rate": 4.679815484975505e-08, "loss": 0.0003, "reward": 1.2083333730697632, "reward_std": 0.22233545035123825, "rewards/correct_code_reward_func": 0.7083333432674408, "rewards/len_reward_func": 0.5, "step": 404 }, { "completion_length": 24.250000953674316, "epoch": 6.432, "grad_norm": 18.631693384669017, "kl": 3.8271484375, "learning_rate": 4.5859084235697235e-08, "loss": 0.0038, "reward": 1.2708333730697632, "reward_std": 0.3584126979112625, "rewards/correct_code_reward_func": 0.7708333432674408, "rewards/len_reward_func": 0.5, "step": 405 }, { "completion_length": 34.60416793823242, "epoch": 6.448, "grad_norm": 11634.914105010444, "kl": 4704.2080078125, "learning_rate": 4.492857896561203e-08, "loss": 4.6919, "reward": 0.6458333432674408, "reward_std": 0.3430154621601105, "rewards/correct_code_reward_func": 0.1666666679084301, "rewards/len_reward_func": 0.4791666716337204, "step": 406 }, { "completion_length": 48.47916793823242, "epoch": 6.464, "grad_norm": 3.031752468264475, "kl": 0.5361328125, "learning_rate": 4.4006678081636885e-08, "loss": 0.0005, "reward": 0.8750000298023224, "reward_std": 0.3493061512708664, "rewards/correct_code_reward_func": 0.3750000223517418, "rewards/len_reward_func": 0.5, "step": 407 }, { "completion_length": 19.562500953674316, "epoch": 6.48, "grad_norm": 6.574838245332442, "kl": 0.373046875, "learning_rate": 4.309342026488652e-08, "loss": 0.0004, "reward": 1.1250000298023224, "reward_std": 0.1178511306643486, "rewards/correct_code_reward_func": 0.6250000298023224, "rewards/len_reward_func": 0.5, "step": 408 }, { "completion_length": 38.375, "epoch": 6.496, "grad_norm": 7.824251153117081, "kl": 0.3798828125, "learning_rate": 4.218884383382987e-08, "loss": 0.0004, "reward": 1.0416666865348816, "reward_std": 0.22233545035123825, "rewards/correct_code_reward_func": 0.5416666716337204, "rewards/len_reward_func": 0.5, "step": 409 }, { "completion_length": 15.979166984558105, "epoch": 6.5120000000000005, "grad_norm": 8.572852559476349, "kl": 0.2822265625, "learning_rate": 4.1292986742682254e-08, "loss": 0.0003, "reward": 0.7083333432674408, "reward_std": 0.07715167850255966, "rewards/correct_code_reward_func": 0.2083333432674408, "rewards/len_reward_func": 0.5, "step": 410 }, { "completion_length": 27.604167938232422, "epoch": 6.5280000000000005, "grad_norm": 1.5853029349225285, "kl": 0.3447265625, "learning_rate": 4.0405886579813006e-08, "loss": 0.0003, "reward": 0.7708333730697632, "reward_std": 0.204109326004982, "rewards/correct_code_reward_func": 0.2708333395421505, "rewards/len_reward_func": 0.5, "step": 411 }, { "completion_length": 20.062500953674316, "epoch": 6.5440000000000005, "grad_norm": 5.332547426817105, "kl": 0.6708984375, "learning_rate": 3.952758056616826e-08, "loss": 0.0007, "reward": 1.2916666865348816, "reward_std": 0.352121964097023, "rewards/correct_code_reward_func": 0.7916666865348816, "rewards/len_reward_func": 0.5, "step": 412 }, { "completion_length": 44.04166793823242, "epoch": 6.5600000000000005, "grad_norm": 4.316518711611452, "kl": 0.43359375, "learning_rate": 3.8658105553709353e-08, "loss": 0.0004, "reward": 0.9375, "reward_std": 0.28126102685928345, "rewards/correct_code_reward_func": 0.4375, "rewards/len_reward_func": 0.5, "step": 413 }, { "completion_length": 79.66666793823242, "epoch": 6.576, "grad_norm": 2.813253474512694, "kl": 0.14697265625, "learning_rate": 3.7797498023866395e-08, "loss": 0.0001, "reward": 0.8125000298023224, "reward_std": 0.3205290511250496, "rewards/correct_code_reward_func": 0.3125000149011612, "rewards/len_reward_func": 0.5, "step": 414 }, { "completion_length": 67.625, "epoch": 6.592, "grad_norm": 4.588432940227484, "kl": 0.67236328125, "learning_rate": 3.6945794086007705e-08, "loss": 0.0007, "reward": 0.7916666865348816, "reward_std": 0.07715167850255966, "rewards/correct_code_reward_func": 0.2916666716337204, "rewards/len_reward_func": 0.5, "step": 415 }, { "completion_length": 46.31250190734863, "epoch": 6.608, "grad_norm": 5.238227799827805, "kl": 1.3623046875, "learning_rate": 3.6103029475924727e-08, "loss": 0.0014, "reward": 0.520833358168602, "reward_std": 0.13607724383473396, "rewards/correct_code_reward_func": 0.0416666679084301, "rewards/len_reward_func": 0.4791666716337204, "step": 416 }, { "completion_length": 36.47916793823242, "epoch": 6.624, "grad_norm": 2.438161085387164, "kl": 0.93359375, "learning_rate": 3.5269239554332556e-08, "loss": 0.0009, "reward": 0.95250004529953, "reward_std": 0.16168292984366417, "rewards/correct_code_reward_func": 0.458333358168602, "rewards/len_reward_func": 0.49416667222976685, "step": 417 }, { "completion_length": 27.77083396911621, "epoch": 6.64, "grad_norm": 1.0466352275241257, "kl": 0.5244140625, "learning_rate": 3.4444459305386504e-08, "loss": 0.0005, "reward": 1.0416666865348816, "reward_std": 0.20693820342421532, "rewards/correct_code_reward_func": 0.5416666716337204, "rewards/len_reward_func": 0.5, "step": 418 }, { "completion_length": 29.479167938232422, "epoch": 6.656, "grad_norm": 7.370012766463159, "kl": 0.2734375, "learning_rate": 3.362872333521388e-08, "loss": 0.0003, "reward": 0.9166666865348816, "reward_std": 0.20693820342421532, "rewards/correct_code_reward_func": 0.4166666716337204, "rewards/len_reward_func": 0.5, "step": 419 }, { "completion_length": 33.27083396911621, "epoch": 6.672, "grad_norm": 2.8559830160043744, "kl": 0.6416015625, "learning_rate": 3.2822065870462215e-08, "loss": 0.0006, "reward": 0.8333333432674408, "reward_std": 0.19500282034277916, "rewards/correct_code_reward_func": 0.3333333432674408, "rewards/len_reward_func": 0.5, "step": 420 }, { "completion_length": 31.041667938232422, "epoch": 6.688, "grad_norm": 20.155745472761566, "kl": 2.296875, "learning_rate": 3.2024520756863236e-08, "loss": 0.0023, "reward": 1.0833333730697632, "reward_std": 0.4657258689403534, "rewards/correct_code_reward_func": 0.583333358168602, "rewards/len_reward_func": 0.5, "step": 421 }, { "completion_length": 32.02083492279053, "epoch": 6.704, "grad_norm": 4.317130217269305, "kl": 0.2734375, "learning_rate": 3.1236121457812545e-08, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.1480126492679119, "rewards/correct_code_reward_func": 0.4375, "rewards/len_reward_func": 0.5, "step": 422 }, { "completion_length": 33.104166984558105, "epoch": 6.72, "grad_norm": 2.4144976734201777, "kl": 0.26904296875, "learning_rate": 3.045690105296572e-08, "loss": 0.0003, "reward": 0.8541666865348816, "reward_std": 0.1753452718257904, "rewards/correct_code_reward_func": 0.3541666716337204, "rewards/len_reward_func": 0.5, "step": 423 }, { "completion_length": 23.83333396911621, "epoch": 6.736, "grad_norm": 4.26144243485857, "kl": 0.4951171875, "learning_rate": 2.9686892236850336e-08, "loss": 0.0005, "reward": 0.8750000298023224, "reward_std": 0.1451837718486786, "rewards/correct_code_reward_func": 0.3750000149011612, "rewards/len_reward_func": 0.5, "step": 424 }, { "completion_length": 52.18750286102295, "epoch": 6.752, "grad_norm": 6.4098073522343375, "kl": 0.239501953125, "learning_rate": 2.892612731749414e-08, "loss": 0.0002, "reward": 0.7500000298023224, "reward_std": 0.22233545035123825, "rewards/correct_code_reward_func": 0.2500000111758709, "rewards/len_reward_func": 0.5, "step": 425 }, { "completion_length": 32.3125, "epoch": 6.768, "grad_norm": 5.582717299169559, "kl": 1.6337890625, "learning_rate": 2.817463821506949e-08, "loss": 0.0016, "reward": 0.8125, "reward_std": 0.0589255653321743, "rewards/correct_code_reward_func": 0.3125, "rewards/len_reward_func": 0.5, "step": 426 }, { "completion_length": 37.45833396911621, "epoch": 6.784, "grad_norm": 9.274067601581377, "kl": 1.51171875, "learning_rate": 2.7432456460553975e-08, "loss": 0.0015, "reward": 0.6875000298023224, "reward_std": 0.2931964099407196, "rewards/correct_code_reward_func": 0.1875000074505806, "rewards/len_reward_func": 0.5, "step": 427 }, { "completion_length": 22.89583396911621, "epoch": 6.8, "grad_norm": 2.8256870547748227, "kl": 1.974609375, "learning_rate": 2.6699613194407723e-08, "loss": 0.002, "reward": 0.8541666865348816, "reward_std": 0.0589255653321743, "rewards/correct_code_reward_func": 0.3541666716337204, "rewards/len_reward_func": 0.5, "step": 428 }, { "completion_length": 57.895835876464844, "epoch": 6.816, "grad_norm": 2.0942452217824026, "kl": 0.22802734375, "learning_rate": 2.5976139165266364e-08, "loss": 0.0002, "reward": 0.75, "reward_std": 0.22233544290065765, "rewards/correct_code_reward_func": 0.25, "rewards/len_reward_func": 0.5, "step": 429 }, { "completion_length": 41.645835876464844, "epoch": 6.832, "grad_norm": 3.326484576610779, "kl": 0.25634765625, "learning_rate": 2.5262064728651194e-08, "loss": 0.0003, "reward": 0.9791666865348816, "reward_std": 0.16340987384319305, "rewards/correct_code_reward_func": 0.4791666716337204, "rewards/len_reward_func": 0.5, "step": 430 }, { "completion_length": 38.41666793823242, "epoch": 6.848, "grad_norm": 0.5971645235217127, "kl": 0.2880859375, "learning_rate": 2.4557419845695427e-08, "loss": 0.0003, "reward": 1.1041667461395264, "reward_std": 0.2041093111038208, "rewards/correct_code_reward_func": 0.6250000298023224, "rewards/len_reward_func": 0.4791666716337204, "step": 431 }, { "completion_length": 32.20833396911621, "epoch": 6.864, "grad_norm": 3.312252032659254, "kl": 0.583984375, "learning_rate": 2.3862234081887033e-08, "loss": 0.0006, "reward": 1.0416666865348816, "reward_std": 0.3177001625299454, "rewards/correct_code_reward_func": 0.5416666865348816, "rewards/len_reward_func": 0.5, "step": 432 }, { "completion_length": 24.812500953674316, "epoch": 6.88, "grad_norm": 1.3331142703147087, "kl": 0.7646484375, "learning_rate": 2.3176536605828438e-08, "loss": 0.0008, "reward": 1.2083333730697632, "reward_std": 0.16623875498771667, "rewards/correct_code_reward_func": 0.7083333432674408, "rewards/len_reward_func": 0.5, "step": 433 }, { "completion_length": 88.10416793823242, "epoch": 6.896, "grad_norm": 5.916709269663577, "kl": 1.01611328125, "learning_rate": 2.250035618801241e-08, "loss": 0.001, "reward": 0.812254399061203, "reward_std": 0.13677212223410606, "rewards/correct_code_reward_func": 0.31250002048909664, "rewards/len_reward_func": 0.4997543394565582, "step": 434 }, { "completion_length": 42.625, "epoch": 6.912, "grad_norm": 8.385217980847418, "kl": 1.11181640625, "learning_rate": 2.183372119961499e-08, "loss": 0.0011, "reward": 1.041666716337204, "reward_std": 0.3731769770383835, "rewards/correct_code_reward_func": 0.5416666865348816, "rewards/len_reward_func": 0.5, "step": 435 }, { "completion_length": 56.395835876464844, "epoch": 6.928, "grad_norm": 10.216799500012975, "kl": 0.5478515625, "learning_rate": 2.117665961130513e-08, "loss": 0.0005, "reward": 0.9375, "reward_std": 0.28126100823283195, "rewards/correct_code_reward_func": 0.4375, "rewards/len_reward_func": 0.5, "step": 436 }, { "completion_length": 24.70833396911621, "epoch": 6.944, "grad_norm": 16.1399495650957, "kl": 0.4716796875, "learning_rate": 2.05291989920712e-08, "loss": 0.0005, "reward": 1.1458333730697632, "reward_std": 0.3142514228820801, "rewards/correct_code_reward_func": 0.645833358168602, "rewards/len_reward_func": 0.5, "step": 437 }, { "completion_length": 31.937501907348633, "epoch": 6.96, "grad_norm": 3.526111590197331, "kl": 0.62890625, "learning_rate": 1.9891366508064e-08, "loss": 0.0006, "reward": 1.1041667461395264, "reward_std": 0.23144195973873138, "rewards/correct_code_reward_func": 0.6041666865348816, "rewards/len_reward_func": 0.5, "step": 438 }, { "completion_length": 30.89583396911621, "epoch": 6.976, "grad_norm": 2.7684451481472845, "kl": 1.005859375, "learning_rate": 1.926318892145712e-08, "loss": 0.001, "reward": 0.8120748400688171, "reward_std": 0.34204548597335815, "rewards/correct_code_reward_func": 0.3125, "rewards/len_reward_func": 0.49957482516765594, "step": 439 }, { "completion_length": 28.625000953674316, "epoch": 6.992, "grad_norm": 10.488372974591355, "kl": 3.521484375, "learning_rate": 1.8644692589323967e-08, "loss": 0.0035, "reward": 1.2291666865348816, "reward_std": 0.37177951633930206, "rewards/correct_code_reward_func": 0.7500000298023224, "rewards/len_reward_func": 0.4791666716337204, "step": 440 }, { "completion_length": 50.79166793823242, "epoch": 7.0, "grad_norm": 10.488372974591355, "kl": 0.859375, "learning_rate": 1.803590346253195e-08, "loss": 0.0004, "reward": 0.875, "reward_std": 0.1178511306643486, "rewards/correct_code_reward_func": 0.375, "rewards/len_reward_func": 0.5, "step": 441 }, { "completion_length": 36.10416793823242, "epoch": 7.016, "grad_norm": 14.864715045495563, "kl": 0.283203125, "learning_rate": 1.7436847084653456e-08, "loss": 0.0003, "reward": 1.0625, "reward_std": 0.2041093371808529, "rewards/correct_code_reward_func": 0.5625, "rewards/len_reward_func": 0.5, "step": 442 }, { "completion_length": 24.4375, "epoch": 7.032, "grad_norm": 0.9322362358127333, "kl": 1.41796875, "learning_rate": 1.6847548590894434e-08, "loss": 0.0014, "reward": 1.0000000596046448, "reward_std": 0.0, "rewards/correct_code_reward_func": 0.5000000149011612, "rewards/len_reward_func": 0.5, "step": 443 }, { "completion_length": 20.229166984558105, "epoch": 7.048, "grad_norm": 8.65798400301399, "kl": 0.55859375, "learning_rate": 1.626803270703936e-08, "loss": 0.0006, "reward": 0.9583333730697632, "reward_std": 0.07715167850255966, "rewards/correct_code_reward_func": 0.458333358168602, "rewards/len_reward_func": 0.5, "step": 444 }, { "completion_length": 61.33333396911621, "epoch": 7.064, "grad_norm": 0.38359849574872695, "kl": 0.626953125, "learning_rate": 1.5698323748414122e-08, "loss": 0.0006, "reward": 0.6250000298023224, "reward_std": 0.22233545035123825, "rewards/correct_code_reward_func": 0.1250000037252903, "rewards/len_reward_func": 0.5, "step": 445 }, { "completion_length": 28.5625, "epoch": 7.08, "grad_norm": 1.1673173612412489, "kl": 0.90625, "learning_rate": 1.513844561886554e-08, "loss": 0.0009, "reward": 1.0416666865348816, "reward_std": 0.2069382146000862, "rewards/correct_code_reward_func": 0.5416666865348816, "rewards/len_reward_func": 0.5, "step": 446 }, { "completion_length": 61.833335876464844, "epoch": 7.096, "grad_norm": 16.04386512848503, "kl": 0.255859375, "learning_rate": 1.4588421809758639e-08, "loss": 0.0003, "reward": 0.8125000298023224, "reward_std": 0.16340987384319305, "rewards/correct_code_reward_func": 0.3125000149011612, "rewards/len_reward_func": 0.5, "step": 447 }, { "completion_length": 36.75, "epoch": 7.112, "grad_norm": 2.1279061887438573, "kl": 0.28515625, "learning_rate": 1.4048275398990894e-08, "loss": 0.0003, "reward": 0.7916666865348816, "reward_std": 0.2553258389234543, "rewards/correct_code_reward_func": 0.291666679084301, "rewards/len_reward_func": 0.5, "step": 448 }, { "completion_length": 27.229166984558105, "epoch": 7.128, "grad_norm": 1.2991397133477103, "kl": 0.34765625, "learning_rate": 1.351802905002386e-08, "loss": 0.0003, "reward": 0.875, "reward_std": 0.07715167850255966, "rewards/correct_code_reward_func": 0.375, "rewards/len_reward_func": 0.5, "step": 449 }, { "completion_length": 33.83333396911621, "epoch": 7.144, "grad_norm": 0.8457590693731686, "kl": 0.521484375, "learning_rate": 1.2997705010932391e-08, "loss": 0.0005, "reward": 0.9375000596046448, "reward_std": 0.24056154489517212, "rewards/correct_code_reward_func": 0.4375000149011612, "rewards/len_reward_func": 0.5, "step": 450 }, { "completion_length": 48.95833396911621, "epoch": 7.16, "grad_norm": 43.218649853018356, "kl": 9.37255859375, "learning_rate": 1.248732511347103e-08, "loss": 0.0093, "reward": 0.7708333730697632, "reward_std": 0.22516431659460068, "rewards/correct_code_reward_func": 0.2708333395421505, "rewards/len_reward_func": 0.5, "step": 451 }, { "completion_length": 38.56250190734863, "epoch": 7.176, "grad_norm": 1.3972703569580567, "kl": 0.703125, "learning_rate": 1.1986910772158105e-08, "loss": 0.0007, "reward": 1.0000000596046448, "reward_std": 0.2342708334326744, "rewards/correct_code_reward_func": 0.5000000149011612, "rewards/len_reward_func": 0.5, "step": 452 }, { "completion_length": 51.95833396911621, "epoch": 7.192, "grad_norm": 1.115685314374603, "kl": 0.72900390625, "learning_rate": 1.1496482983377188e-08, "loss": 0.0007, "reward": 1.0236400961875916, "reward_std": 0.27332228422164917, "rewards/correct_code_reward_func": 0.5416666865348816, "rewards/len_reward_func": 0.48197343945503235, "step": 453 }, { "completion_length": 42.41666793823242, "epoch": 7.208, "grad_norm": 8.181481838866919, "kl": 1.8310546875, "learning_rate": 1.1016062324496007e-08, "loss": 0.0018, "reward": 0.7291666865348816, "reward_std": 0.28126102685928345, "rewards/correct_code_reward_func": 0.229166679084301, "rewards/len_reward_func": 0.5, "step": 454 }, { "completion_length": 31.687500953674316, "epoch": 7.224, "grad_norm": 3.5471143654468107, "kl": 0.75, "learning_rate": 1.054566895300324e-08, "loss": 0.0008, "reward": 1.1875, "reward_std": 0.13607724383473396, "rewards/correct_code_reward_func": 0.6875, "rewards/len_reward_func": 0.5, "step": 455 }, { "completion_length": 31.750001907348633, "epoch": 7.24, "grad_norm": 3.358209387145501, "kl": 0.62060546875, "learning_rate": 1.0085322605662666e-08, "loss": 0.0006, "reward": 0.6666666865348816, "reward_std": 0.0, "rewards/correct_code_reward_func": 0.1666666716337204, "rewards/len_reward_func": 0.5, "step": 456 }, { "completion_length": 36.50000190734863, "epoch": 7.256, "grad_norm": 1.3699979933630457, "kl": 0.3955078125, "learning_rate": 9.635042597685023e-09, "loss": 0.0004, "reward": 1.0208333730697632, "reward_std": 0.23144196718931198, "rewards/correct_code_reward_func": 0.5208333432674408, "rewards/len_reward_func": 0.5, "step": 457 }, { "completion_length": 45.4375, "epoch": 7.272, "grad_norm": 12.90766174002275, "kl": 0.28125, "learning_rate": 9.194847821917623e-09, "loss": 0.0003, "reward": 1.0, "reward_std": 0.2994871288537979, "rewards/correct_code_reward_func": 0.5, "rewards/len_reward_func": 0.5, "step": 458 }, { "completion_length": 42.291666984558105, "epoch": 7.288, "grad_norm": 1.7345212956082203, "kl": 0.2353515625, "learning_rate": 8.764756748051661e-09, "loss": 0.0002, "reward": 0.979166716337204, "reward_std": 0.23709972202777863, "rewards/correct_code_reward_func": 0.479166679084301, "rewards/len_reward_func": 0.5, "step": 459 }, { "completion_length": 44.64583396911621, "epoch": 7.304, "grad_norm": 0.8074298440188371, "kl": 0.59375, "learning_rate": 8.344787421847216e-09, "loss": 0.0006, "reward": 0.9375000596046448, "reward_std": 0.08625819534063339, "rewards/correct_code_reward_func": 0.4375000149011612, "rewards/len_reward_func": 0.5, "step": 460 }, { "completion_length": 31.375000953674316, "epoch": 7.32, "grad_norm": 1.0606388187901021, "kl": 0.435546875, "learning_rate": 7.934957464376058e-09, "loss": 0.0004, "reward": 0.6458333432674408, "reward_std": 0.0589255653321743, "rewards/correct_code_reward_func": 0.1458333432674408, "rewards/len_reward_func": 0.5, "step": 461 }, { "completion_length": 25.000000953674316, "epoch": 7.336, "grad_norm": 1.0647975039517257, "kl": 0.4501953125, "learning_rate": 7.535284071282455e-09, "loss": 0.0005, "reward": 1.0416666865348816, "reward_std": 0.1451837718486786, "rewards/correct_code_reward_func": 0.5416666716337204, "rewards/len_reward_func": 0.5, "step": 462 }, { "completion_length": 21.750000953674316, "epoch": 7.352, "grad_norm": 0.5043304022456115, "kl": 0.34375, "learning_rate": 7.145784012061423e-09, "loss": 0.0003, "reward": 0.9583333730697632, "reward_std": 0.1451837718486786, "rewards/correct_code_reward_func": 0.4583333432674408, "rewards/len_reward_func": 0.5, "step": 463 }, { "completion_length": 40.75, "epoch": 7.368, "grad_norm": 68.80649583724717, "kl": 54.8515625, "learning_rate": 6.766473629355452e-09, "loss": 0.0548, "reward": 0.9166666865348816, "reward_std": 0.2840898931026459, "rewards/correct_code_reward_func": 0.4166666716337204, "rewards/len_reward_func": 0.5, "step": 464 }, { "completion_length": 53.83333396911621, "epoch": 7.384, "grad_norm": 6.320430427612535, "kl": 0.2294921875, "learning_rate": 6.397368838268496e-09, "loss": 0.0002, "reward": 0.9583333730697632, "reward_std": 0.4446708858013153, "rewards/correct_code_reward_func": 0.4583333432674408, "rewards/len_reward_func": 0.5, "step": 465 }, { "completion_length": 53.56250190734863, "epoch": 7.4, "grad_norm": 1.270276850939279, "kl": 0.21435546875, "learning_rate": 6.038485125698295e-09, "loss": 0.0002, "reward": 0.6875000298023224, "reward_std": 0.1767766959965229, "rewards/correct_code_reward_func": 0.1875000111758709, "rewards/len_reward_func": 0.5, "step": 466 }, { "completion_length": 48.18750190734863, "epoch": 7.416, "grad_norm": 5.003487718170664, "kl": 1.1865234375, "learning_rate": 5.689837549686744e-09, "loss": 0.0012, "reward": 0.8333333432674408, "reward_std": 0.27215447276830673, "rewards/correct_code_reward_func": 0.3333333432674408, "rewards/len_reward_func": 0.5, "step": 467 }, { "completion_length": 39.583335876464844, "epoch": 7.432, "grad_norm": 2.735415228101247, "kl": 0.3740234375, "learning_rate": 5.3514407387877936e-09, "loss": 0.0004, "reward": 0.5833333432674408, "reward_std": 0.1451837718486786, "rewards/correct_code_reward_func": 0.0833333358168602, "rewards/len_reward_func": 0.5, "step": 468 }, { "completion_length": 37.31250190734863, "epoch": 7.448, "grad_norm": 11.016089026286402, "kl": 0.24560546875, "learning_rate": 5.023308891453915e-09, "loss": 0.0002, "reward": 0.8750000298023224, "reward_std": 0.2840898856520653, "rewards/correct_code_reward_func": 0.3750000149011612, "rewards/len_reward_func": 0.5, "step": 469 }, { "completion_length": 25.89583396911621, "epoch": 7.464, "grad_norm": 3.2930129644698196, "kl": 0.3974609375, "learning_rate": 4.705455775440237e-09, "loss": 0.0004, "reward": 0.9791666865348816, "reward_std": 0.13607724383473396, "rewards/correct_code_reward_func": 0.4791666865348816, "rewards/len_reward_func": 0.5, "step": 470 }, { "completion_length": 37.70833396911621, "epoch": 7.48, "grad_norm": 2.9194234148211318, "kl": 3.8671875, "learning_rate": 4.3978947272269305e-09, "loss": 0.0039, "reward": 0.6458333432674408, "reward_std": 0.1753452718257904, "rewards/correct_code_reward_func": 0.1458333432674408, "rewards/len_reward_func": 0.5, "step": 471 }, { "completion_length": 36.16666793823242, "epoch": 7.496, "grad_norm": 6.793516750616495, "kl": 12.18359375, "learning_rate": 4.100638651459542e-09, "loss": 0.0122, "reward": 1.0625000596046448, "reward_std": 0.39768069982528687, "rewards/correct_code_reward_func": 0.5833333432674408, "rewards/len_reward_func": 0.4791666716337204, "step": 472 }, { "completion_length": 33.33333492279053, "epoch": 7.5120000000000005, "grad_norm": 3.5785167861657876, "kl": 0.4638671875, "learning_rate": 3.813700020407706e-09, "loss": 0.0005, "reward": 1.2500000596046448, "reward_std": 0.22233543917536736, "rewards/correct_code_reward_func": 0.7500000298023224, "rewards/len_reward_func": 0.5, "step": 473 }, { "completion_length": 27.229167938232422, "epoch": 7.5280000000000005, "grad_norm": 10.521524045166144, "kl": 4.5869140625, "learning_rate": 3.5370908734417006e-09, "loss": 0.0046, "reward": 0.6041666865348816, "reward_std": 0.08625819534063339, "rewards/correct_code_reward_func": 0.1041666716337204, "rewards/len_reward_func": 0.5, "step": 474 }, { "completion_length": 35.45833396911621, "epoch": 7.5440000000000005, "grad_norm": 3.8203641546567724, "kl": 3.22265625, "learning_rate": 3.2708228165273244e-09, "loss": 0.0032, "reward": 0.9583333432674408, "reward_std": 0.2903675436973572, "rewards/correct_code_reward_func": 0.4583333358168602, "rewards/len_reward_func": 0.5, "step": 475 }, { "completion_length": 30.14583396911621, "epoch": 7.5600000000000005, "grad_norm": 2.1270667018702687, "kl": 0.291015625, "learning_rate": 3.0149070217390106e-09, "loss": 0.0003, "reward": 1.3333333730697632, "reward_std": 0.1178511306643486, "rewards/correct_code_reward_func": 0.8333333730697632, "rewards/len_reward_func": 0.5, "step": 476 }, { "completion_length": 30.27083396911621, "epoch": 7.576, "grad_norm": 3.9308692428339946, "kl": 0.375, "learning_rate": 2.769354226790893e-09, "loss": 0.0004, "reward": 0.7708333730697632, "reward_std": 0.21322892606258392, "rewards/correct_code_reward_func": 0.2708333432674408, "rewards/len_reward_func": 0.5, "step": 477 }, { "completion_length": 55.45833396911621, "epoch": 7.592, "grad_norm": 2.256041899218313, "kl": 0.3427734375, "learning_rate": 2.5341747345865026e-09, "loss": 0.0003, "reward": 1.0411083102226257, "reward_std": 0.20673340186476707, "rewards/correct_code_reward_func": 0.5625000223517418, "rewards/len_reward_func": 0.47860829532146454, "step": 478 }, { "completion_length": 34.5625, "epoch": 7.608, "grad_norm": 4.963557661177748, "kl": 1.2607421875, "learning_rate": 2.3093784127863057e-09, "loss": 0.0013, "reward": 0.8541666865348816, "reward_std": 0.30859363824129105, "rewards/correct_code_reward_func": 0.3541666679084301, "rewards/len_reward_func": 0.5, "step": 479 }, { "completion_length": 43.395835876464844, "epoch": 7.624, "grad_norm": 0.9036937312542045, "kl": 2.1318359375, "learning_rate": 2.094974693393731e-09, "loss": 0.0021, "reward": 0.75, "reward_std": 0.08908708393573761, "rewards/correct_code_reward_func": 0.25, "rewards/len_reward_func": 0.5, "step": 480 }, { "completion_length": 42.18750190734863, "epoch": 7.64, "grad_norm": 7.599576735747625, "kl": 0.255859375, "learning_rate": 1.890972572359456e-09, "loss": 0.0003, "reward": 1.1041666865348816, "reward_std": 0.320529043674469, "rewards/correct_code_reward_func": 0.6041666865348816, "rewards/len_reward_func": 0.5, "step": 481 }, { "completion_length": 41.062500953674316, "epoch": 7.656, "grad_norm": 7.576066045438129, "kl": 0.95947265625, "learning_rate": 1.6973806092038523e-09, "loss": 0.001, "reward": 0.8958333432674408, "reward_std": 0.22516432031989098, "rewards/correct_code_reward_func": 0.3958333432674408, "rewards/len_reward_func": 0.5, "step": 482 }, { "completion_length": 66.18750381469727, "epoch": 7.672, "grad_norm": 2.1174824529271934, "kl": 0.44189453125, "learning_rate": 1.514206926658046e-09, "loss": 0.0004, "reward": 0.828984946012497, "reward_std": 0.19452743232250214, "rewards/correct_code_reward_func": 0.3541666865348816, "rewards/len_reward_func": 0.47481827437877655, "step": 483 }, { "completion_length": 32.83333396911621, "epoch": 7.688, "grad_norm": 5.163246199946358, "kl": 3.888671875, "learning_rate": 1.3414592103228594e-09, "loss": 0.0039, "reward": 1.0833333730697632, "reward_std": 0.19500282034277916, "rewards/correct_code_reward_func": 0.5833333432674408, "rewards/len_reward_func": 0.5, "step": 484 }, { "completion_length": 52.87500286102295, "epoch": 7.704, "grad_norm": 2.809107387786595, "kl": 4.5390625, "learning_rate": 1.1791447083465133e-09, "loss": 0.0045, "reward": 1.0610772967338562, "reward_std": 0.31889135390520096, "rewards/correct_code_reward_func": 0.5833333730697632, "rewards/len_reward_func": 0.4777439534664154, "step": 485 }, { "completion_length": 67.33333587646484, "epoch": 7.72, "grad_norm": 8.986243345931461, "kl": 9.318359375, "learning_rate": 1.0272702311203695e-09, "loss": 0.0093, "reward": 0.9583333730697632, "reward_std": 0.34018656611442566, "rewards/correct_code_reward_func": 0.4583333432674408, "rewards/len_reward_func": 0.5, "step": 486 }, { "completion_length": 29.687501907348633, "epoch": 7.736, "grad_norm": 0.7218544621194819, "kl": 0.833984375, "learning_rate": 8.858421509933823e-10, "loss": 0.0008, "reward": 0.7291666865348816, "reward_std": 0.13607724383473396, "rewards/correct_code_reward_func": 0.2291666679084301, "rewards/len_reward_func": 0.5, "step": 487 }, { "completion_length": 31.27083396911621, "epoch": 7.752, "grad_norm": 7.305486246951896, "kl": 0.490234375, "learning_rate": 7.548664020045059e-10, "loss": 0.0005, "reward": 0.979166716337204, "reward_std": 0.21322893351316452, "rewards/correct_code_reward_func": 0.4791666865348816, "rewards/len_reward_func": 0.5, "step": 488 }, { "completion_length": 33.4375, "epoch": 7.768, "grad_norm": 5.374284656554601, "kl": 0.3330078125, "learning_rate": 6.343484796338394e-10, "loss": 0.0003, "reward": 0.8125000298023224, "reward_std": 0.22516432031989098, "rewards/correct_code_reward_func": 0.31250002048909664, "rewards/len_reward_func": 0.5, "step": 489 }, { "completion_length": 29.937500953674316, "epoch": 7.784, "grad_norm": 20.72618168594815, "kl": 0.4404296875, "learning_rate": 5.242934405720878e-10, "loss": 0.0004, "reward": 0.7708333432674408, "reward_std": 0.28126100823283195, "rewards/correct_code_reward_func": 0.27083333395421505, "rewards/len_reward_func": 0.5, "step": 490 }, { "completion_length": 63.47916793823242, "epoch": 7.8, "grad_norm": 7.480626745233278, "kl": 0.3974609375, "learning_rate": 4.2470590250823223e-10, "loss": 0.0004, "reward": 0.8125000298023224, "reward_std": 0.175345279276371, "rewards/correct_code_reward_func": 0.3125000074505806, "rewards/len_reward_func": 0.5, "step": 491 }, { "completion_length": 56.16666793823242, "epoch": 7.816, "grad_norm": 4.532566153481945, "kl": 0.3486328125, "learning_rate": 3.355900439359072e-10, "loss": 0.0003, "reward": 0.9583333730697632, "reward_std": 0.19500282034277916, "rewards/correct_code_reward_func": 0.4583333432674408, "rewards/len_reward_func": 0.5, "step": 492 }, { "completion_length": 42.8125, "epoch": 7.832, "grad_norm": 9.37284729343891, "kl": 0.2890625, "learning_rate": 2.569496039780683e-10, "loss": 0.0003, "reward": 0.8958333432674408, "reward_std": 0.30231600999832153, "rewards/correct_code_reward_func": 0.4166666865348816, "rewards/len_reward_func": 0.4791666716337204, "step": 493 }, { "completion_length": 25.291666984558105, "epoch": 7.848, "grad_norm": 8.181654775124288, "kl": 1.865234375, "learning_rate": 1.8878788223009035e-10, "loss": 0.0019, "reward": 1.2916667461395264, "reward_std": 0.24966806918382645, "rewards/correct_code_reward_func": 0.7916666865348816, "rewards/len_reward_func": 0.5, "step": 494 }, { "completion_length": 36.47916793823242, "epoch": 7.864, "grad_norm": 2.730688674099727, "kl": 0.3271484375, "learning_rate": 1.3110773862126667e-10, "loss": 0.0003, "reward": 0.9583333730697632, "reward_std": 0.19500280916690826, "rewards/correct_code_reward_func": 0.4583333432674408, "rewards/len_reward_func": 0.5, "step": 495 }, { "completion_length": 38.458335876464844, "epoch": 7.88, "grad_norm": 2.8350462414240383, "kl": 0.3203125, "learning_rate": 8.391159329496079e-11, "loss": 0.0003, "reward": 0.9583333432674408, "reward_std": 0.38511236757040024, "rewards/correct_code_reward_func": 0.4583333432674408, "rewards/len_reward_func": 0.5, "step": 496 }, { "completion_length": 22.937500953674316, "epoch": 7.896, "grad_norm": 9.26990536147569, "kl": 0.361328125, "learning_rate": 4.7201426506854324e-11, "loss": 0.0004, "reward": 1.0208333432674408, "reward_std": 0.42644475400447845, "rewards/correct_code_reward_func": 0.5208333432674408, "rewards/len_reward_func": 0.5, "step": 497 }, { "completion_length": 35.93750190734863, "epoch": 7.912, "grad_norm": 3.8907781924394014, "kl": 0.40234375, "learning_rate": 2.097877854204122e-11, "loss": 0.0004, "reward": 0.8333333730697632, "reward_std": 0.0, "rewards/correct_code_reward_func": 0.3333333432674408, "rewards/len_reward_func": 0.5, "step": 498 }, { "completion_length": 38.645835876464844, "epoch": 7.928, "grad_norm": 13.982592715406271, "kl": 17.47998046875, "learning_rate": 5.244749650301639e-12, "loss": 0.0175, "reward": 1.041666716337204, "reward_std": 0.16623875498771667, "rewards/correct_code_reward_func": 0.5416666865348816, "rewards/len_reward_func": 0.5, "step": 499 }, { "completion_length": 29.500001907348633, "epoch": 7.944, "grad_norm": 0.028883891880915308, "kl": 2.763671875, "learning_rate": 0.0, "loss": 0.0028, "reward": 0.8333333730697632, "reward_std": 0.0, "rewards/correct_code_reward_func": 0.3333333432674408, "rewards/len_reward_func": 0.5, "step": 500 }, { "epoch": 7.944, "step": 500, "total_flos": 0.0, "train_loss": 0.019628612981648565, "train_runtime": 7912.6404, "train_samples_per_second": 0.379, "train_steps_per_second": 0.063 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 9, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }