diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,7 +1,7 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 7.992, + "epoch": 7.944, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, @@ -9,6520 +9,6520 @@ "is_world_process_zero": true, "log_history": [ { - "completion_length": 178.3541717529297, + "completion_length": 450.06251525878906, "epoch": 0.016, - "grad_norm": 0.7053135404726241, + "grad_norm": 0.8495607523800367, "kl": 0.0, - "learning_rate": 1.25e-07, + "learning_rate": 3.3333333333333334e-08, "loss": 0.0, - "reward": 0.5438157171010971, - "reward_std": 0.4464171230792999, - "rewards/correct_code_reward_func": 0.2500000074505806, - "rewards/len_reward_func": 0.2938157171010971, + "reward": 0.502269446849823, + "reward_std": 0.42649583518505096, + "rewards/correct_code_reward_func": 0.3958333432674408, + "rewards/len_reward_func": 0.1064360924065113, "step": 1 }, { - "completion_length": 176.89584350585938, + "completion_length": 507.75001525878906, "epoch": 0.032, - "grad_norm": 2.3417507999605864, + "grad_norm": 1.371013897533057, "kl": 0.0, - "learning_rate": 2.5e-07, + "learning_rate": 6.666666666666667e-08, "loss": -0.0, - "reward": 0.4928237199783325, - "reward_std": 0.3440842181444168, - "rewards/correct_code_reward_func": 0.1458333432674408, - "rewards/len_reward_func": 0.34699034690856934, + "reward": 0.31612901389598846, + "reward_std": 0.5700598657131195, + "rewards/correct_code_reward_func": 0.2708333432674408, + "rewards/len_reward_func": 0.045295679941773415, "step": 2 }, { - "completion_length": 192.25, + "completion_length": 493.93751525878906, "epoch": 0.048, - "grad_norm": 0.48071888167986226, - "kl": -3.4809112548828125e-05, - "learning_rate": 3.75e-07, - "loss": -0.0, - "reward": 0.449026882648468, - "reward_std": 0.47666139900684357, - "rewards/correct_code_reward_func": 0.1666666716337204, - "rewards/len_reward_func": 0.2823602259159088, + "grad_norm": 0.5754588380314621, + "kl": 3.3274292945861816e-05, + "learning_rate": 1e-07, + "loss": 0.0, + "reward": 0.44113826751708984, + "reward_std": 0.5670813024044037, + "rewards/correct_code_reward_func": 0.3750000149011612, + "rewards/len_reward_func": 0.06613820930942893, "step": 3 }, { - "completion_length": 270.7291717529297, + "completion_length": 530.4583435058594, "epoch": 0.064, - "grad_norm": 0.3569537292235904, - "kl": 4.172325134277344e-07, - "learning_rate": 5e-07, - "loss": 0.0, - "reward": 0.45451289415359497, - "reward_std": 0.3296015188097954, - "rewards/correct_code_reward_func": 0.1041666716337204, - "rewards/len_reward_func": 0.3503462225198746, + "grad_norm": 1.6421888296529672, + "kl": -2.0503997802734375e-05, + "learning_rate": 1.3333333333333334e-07, + "loss": -0.0, + "reward": 0.16917918622493744, + "reward_std": 0.43559131026268005, + "rewards/correct_code_reward_func": 0.125, + "rewards/len_reward_func": 0.044179188553243876, "step": 4 }, { - "completion_length": 232.58334350585938, + "completion_length": 405.43751525878906, "epoch": 0.08, - "grad_norm": 0.4439935569873824, - "kl": -4.214048385620117e-05, - "learning_rate": 4.999157413258781e-07, - "loss": -0.0, - "reward": 0.2809216380119324, - "reward_std": 0.37323758006095886, - "rewards/correct_code_reward_func": 0.0, - "rewards/len_reward_func": 0.2809216380119324, + "grad_norm": 2.4469782065215795, + "kl": 6.712973117828369e-06, + "learning_rate": 1.6666666666666665e-07, + "loss": 0.0, + "reward": 0.21715006977319717, + "reward_std": 0.3707175552845001, + "rewards/correct_code_reward_func": 0.0416666679084301, + "rewards/len_reward_func": 0.17548342049121857, "step": 5 }, { - "completion_length": 261.9375, + "completion_length": 520.6041717529297, "epoch": 0.096, - "grad_norm": 0.3541024748340305, - "kl": -3.1054019927978516e-05, - "learning_rate": 4.996630220997057e-07, + "grad_norm": 0.46306313754526307, + "kl": -2.2113323211669922e-05, + "learning_rate": 2e-07, "loss": -0.0, - "reward": 0.4689294695854187, - "reward_std": 0.4829525202512741, - "rewards/correct_code_reward_func": 0.1875, - "rewards/len_reward_func": 0.2814294695854187, + "reward": 0.37700220942497253, + "reward_std": 0.5394288003444672, + "rewards/correct_code_reward_func": 0.3541666716337204, + "rewards/len_reward_func": 0.02283555455505848, "step": 6 }, { - "completion_length": 306.12501525878906, + "completion_length": 451.5, "epoch": 0.112, - "grad_norm": 0.3947267586718111, - "kl": -2.968311309814453e-05, - "learning_rate": 4.992420126717784e-07, + "grad_norm": 2.609645604720621, + "kl": -1.2526754289865494e-05, + "learning_rate": 2.3333333333333333e-07, "loss": -0.0, - "reward": 0.3299577385187149, - "reward_std": 0.30695800483226776, - "rewards/correct_code_reward_func": 0.0, - "rewards/len_reward_func": 0.3299577236175537, + "reward": 0.20611736923456192, + "reward_std": 0.37926262617111206, + "rewards/correct_code_reward_func": 0.0833333358168602, + "rewards/len_reward_func": 0.12278405949473381, "step": 7 }, { - "completion_length": 238.85417938232422, + "completion_length": 471.4375, "epoch": 0.128, - "grad_norm": 0.5118206169412205, - "kl": -4.1604042053222656e-05, - "learning_rate": 4.986529968316653e-07, - "loss": 0.0, - "reward": 0.4368641823530197, - "reward_std": 0.2371533028781414, - "rewards/correct_code_reward_func": 0.08333333395421505, - "rewards/len_reward_func": 0.3535308539867401, + "grad_norm": 0.4804918460566338, + "kl": -2.968311309814453e-05, + "learning_rate": 2.6666666666666667e-07, + "loss": -0.0, + "reward": 0.38565604388713837, + "reward_std": 0.4431494176387787, + "rewards/correct_code_reward_func": 0.2916666716337204, + "rewards/len_reward_func": 0.0939893783070147, "step": 8 }, { - "completion_length": 304.7708435058594, + "completion_length": 463.22918701171875, "epoch": 0.144, - "grad_norm": 0.3493288643723073, - "kl": -5.060434341430664e-05, - "learning_rate": 4.978963716169165e-07, + "grad_norm": 0.3372987851073737, + "kl": 6.516464054584503e-06, + "learning_rate": 3e-07, "loss": -0.0, - "reward": 0.4636567234992981, - "reward_std": 0.46522316336631775, - "rewards/correct_code_reward_func": 0.1666666679084301, - "rewards/len_reward_func": 0.2969900518655777, + "reward": 0.44847334921360016, + "reward_std": 0.5335665941238403, + "rewards/correct_code_reward_func": 0.3333333432674408, + "rewards/len_reward_func": 0.11514000222086906, "step": 9 }, { - "completion_length": 264.4791717529297, + "completion_length": 361.62501525878906, "epoch": 0.16, - "grad_norm": 0.8959331411324142, - "kl": -7.329881191253662e-05, - "learning_rate": 4.969726470454313e-07, + "grad_norm": 0.7532184792335336, + "kl": 1.1920928955078125e-06, + "learning_rate": 3.333333333333333e-07, "loss": -0.0, - "reward": 0.5069085508584976, - "reward_std": 0.37757958471775055, - "rewards/correct_code_reward_func": 0.1666666716337204, - "rewards/len_reward_func": 0.3402418941259384, + "reward": 0.456779420375824, + "reward_std": 0.5467902272939682, + "rewards/correct_code_reward_func": 0.2708333432674408, + "rewards/len_reward_func": 0.18594606965780258, "step": 10 }, { - "completion_length": 201.3541717529297, + "completion_length": 473.2083435058594, "epoch": 0.176, - "grad_norm": 0.4500851901199424, - "kl": -4.0531158447265625e-05, - "learning_rate": 4.958824457716706e-07, + "grad_norm": 0.5536327901950325, + "kl": -1.7859041690826416e-05, + "learning_rate": 3.666666666666666e-07, "loss": -0.0, - "reward": 0.5010976195335388, - "reward_std": 0.5051684230566025, - "rewards/correct_code_reward_func": 0.1666666716337204, - "rewards/len_reward_func": 0.334430992603302, + "reward": 0.29044337570667267, + "reward_std": 0.4437016546726227, + "rewards/correct_code_reward_func": 0.1875, + "rewards/len_reward_func": 0.10294336080551147, "step": 11 }, { - "completion_length": 297.37500762939453, + "completion_length": 328.50001525878906, "epoch": 0.192, - "grad_norm": 0.43567877016900936, - "kl": -4.696846008300781e-05, - "learning_rate": 4.946265026669454e-07, - "loss": -0.0, - "reward": 0.3353367894887924, - "reward_std": 0.3315615653991699, - "rewards/correct_code_reward_func": 0.02083333395421505, - "rewards/len_reward_func": 0.3145034611225128, + "grad_norm": 1.0753885525580353, + "kl": 1.913309097290039e-05, + "learning_rate": 4e-07, + "loss": 0.0, + "reward": 0.2568899691104889, + "reward_std": 0.4477514177560806, + "rewards/correct_code_reward_func": 0.0625, + "rewards/len_reward_func": 0.1943899691104889, "step": 12 }, { - "completion_length": 201.75, + "completion_length": 576.0208435058594, "epoch": 0.208, - "grad_norm": 0.5760425272270367, - "kl": 2.5451183319091797e-05, - "learning_rate": 4.932056643240618e-07, + "grad_norm": 0.6665638976585759, + "kl": 1.4901161193847656e-05, + "learning_rate": 4.3333333333333335e-07, "loss": 0.0, - "reward": 0.6330613493919373, - "reward_std": 0.5388243198394775, - "rewards/correct_code_reward_func": 0.3541666716337204, - "rewards/len_reward_func": 0.27889466285705566, + "reward": 0.4603695869445801, + "reward_std": 0.5321745276451111, + "rewards/correct_code_reward_func": 0.4791666865348816, + "rewards/len_reward_func": -0.018797069787979126, "step": 13 }, { - "completion_length": 288.0416717529297, + "completion_length": 372.1458435058594, "epoch": 0.224, - "grad_norm": 0.6576192766028296, - "kl": 7.191300392150879e-05, - "learning_rate": 4.916208884866592e-07, + "grad_norm": 0.5376139925261669, + "kl": -7.539987564086914e-06, + "learning_rate": 4.6666666666666666e-07, "loss": 0.0, - "reward": 0.39246678352355957, - "reward_std": 0.44181104004383087, - "rewards/correct_code_reward_func": 0.08333333395421505, - "rewards/len_reward_func": 0.3091334402561188, + "reward": 0.35033082216978073, + "reward_std": 0.4991242587566376, + "rewards/correct_code_reward_func": 0.1875000111758709, + "rewards/len_reward_func": 0.16283082962036133, "step": 14 }, { - "completion_length": 183.75000762939453, + "completion_length": 368.0625, "epoch": 0.24, - "grad_norm": 0.6134701851816485, - "kl": 0.00014084577560424805, - "learning_rate": 4.898732434036243e-07, - "loss": 0.0, - "reward": 0.42976467311382294, - "reward_std": 0.3796156942844391, - "rewards/correct_code_reward_func": 0.1250000037252903, - "rewards/len_reward_func": 0.30476468801498413, + "grad_norm": 0.7714093926878073, + "kl": -1.0609626770019531e-05, + "learning_rate": 5e-07, + "loss": -0.0, + "reward": 0.5642553567886353, + "reward_std": 0.4422197937965393, + "rewards/correct_code_reward_func": 0.375, + "rewards/len_reward_func": 0.18925533443689346, "step": 15 }, { - "completion_length": 207.77083587646484, + "completion_length": 355.3125, "epoch": 0.256, - "grad_norm": 0.7064422469677513, - "kl": 0.000293731689453125, - "learning_rate": 4.879639071090173e-07, + "grad_norm": 5.105471931223821, + "kl": 0.00017064809799194336, + "learning_rate": 4.999947552503497e-07, "loss": 0.0, - "reward": 0.35718174278736115, - "reward_std": 0.408719003200531, - "rewards/correct_code_reward_func": 0.0625, - "rewards/len_reward_func": 0.29468175768852234, + "reward": 0.26038650423288345, + "reward_std": 0.4197434335947037, + "rewards/correct_code_reward_func": 0.125, + "rewards/len_reward_func": 0.13538648188114166, "step": 16 }, { - "completion_length": 194.00000762939453, + "completion_length": 412.62501525878906, "epoch": 0.272, - "grad_norm": 0.4827506793012771, - "kl": 2.288818359375e-05, - "learning_rate": 4.858941666279955e-07, - "loss": -0.0, - "reward": 0.5703845322132111, - "reward_std": 0.39449170231819153, - "rewards/correct_code_reward_func": 0.2500000111758709, - "rewards/len_reward_func": 0.32038453221321106, + "grad_norm": 2.3022353306620618, + "kl": 7.212162017822266e-05, + "learning_rate": 4.999790212214579e-07, + "loss": 0.0, + "reward": 0.2577357590198517, + "reward_std": 0.4265839755535126, + "rewards/correct_code_reward_func": 0.1666666716337204, + "rewards/len_reward_func": 0.09106908613466658, "step": 17 }, { - "completion_length": 238.18750762939453, + "completion_length": 378.3333435058594, "epoch": 0.288, - "grad_norm": 0.41996129426983214, - "kl": 0.00011217594146728516, - "learning_rate": 4.836654171092682e-07, - "loss": 0.0, - "reward": 0.6033974885940552, - "reward_std": 0.43378083407878876, - "rewards/correct_code_reward_func": 0.2708333358168602, - "rewards/len_reward_func": 0.33256417512893677, + "grad_norm": 0.4875991126449715, + "kl": -3.3736228942871094e-05, + "learning_rate": 4.999527985734931e-07, + "loss": -0.0, + "reward": 0.4722817540168762, + "reward_std": 0.5266325920820236, + "rewards/correct_code_reward_func": 0.2916666716337204, + "rewards/len_reward_func": 0.18061506003141403, "step": 18 }, { - "completion_length": 233.95834350585938, + "completion_length": 344.5208435058594, "epoch": 0.304, - "grad_norm": 0.5123314162893023, - "kl": 0.00013169646263122559, - "learning_rate": 4.812791608846709e-07, + "grad_norm": 1.0396038838546207, + "kl": 0.00010180473327636719, + "learning_rate": 4.99916088406705e-07, "loss": 0.0, - "reward": 0.40910618007183075, - "reward_std": 0.3389303684234619, - "rewards/correct_code_reward_func": 0.0833333358168602, - "rewards/len_reward_func": 0.32577285170555115, + "reward": 0.3391585648059845, + "reward_std": 0.5373022556304932, + "rewards/correct_code_reward_func": 0.2291666679084301, + "rewards/len_reward_func": 0.10999187082052231, "step": 19 }, { - "completion_length": 316.3125, + "completion_length": 451.8958435058594, "epoch": 0.32, - "grad_norm": 0.47220846544005757, - "kl": 0.0002579689025878906, - "learning_rate": 4.787370064564882e-07, + "grad_norm": 2.947144360733391, + "kl": 0.0004801750183105469, + "learning_rate": 4.998688922613787e-07, "loss": 0.0, - "reward": 0.41953830420970917, - "reward_std": 0.4723764508962631, - "rewards/correct_code_reward_func": 0.1458333358168602, - "rewards/len_reward_func": 0.2737049460411072, + "reward": 0.25417882204055786, + "reward_std": 0.3951431214809418, + "rewards/correct_code_reward_func": 0.1458333395421505, + "rewards/len_reward_func": 0.10834548436105251, "step": 20 }, { - "completion_length": 227.7916717529297, + "completion_length": 351.14583587646484, "epoch": 0.336, - "grad_norm": 0.6008188696097762, - "kl": 0.0004267692565917969, - "learning_rate": 4.7604066741321253e-07, + "grad_norm": 0.3323592751582949, + "kl": 6.604194641113281e-05, + "learning_rate": 4.998112121177698e-07, "loss": 0.0, - "reward": 0.6169633865356445, - "reward_std": 0.48675188422203064, - "rewards/correct_code_reward_func": 0.3333333432674408, - "rewards/len_reward_func": 0.28363004326820374, + "reward": 0.533460944890976, + "reward_std": 0.528270423412323, + "rewards/correct_code_reward_func": 0.375, + "rewards/len_reward_func": 0.15846090763807297, "step": 21 }, { - "completion_length": 252.27084350585938, + "completion_length": 350.62501525878906, "epoch": 0.352, - "grad_norm": 0.5160855324353004, - "kl": 0.0001907944679260254, - "learning_rate": 4.731919612744659e-07, + "grad_norm": 0.48509498294245135, + "kl": 0.00014066696166992188, + "learning_rate": 4.997430503960219e-07, "loss": 0.0, - "reward": 0.5648790299892426, - "reward_std": 0.31752249598503113, - "rewards/correct_code_reward_func": 0.18750000558793545, - "rewards/len_reward_func": 0.37737900018692017, + "reward": 0.4158083647489548, + "reward_std": 0.5351734161376953, + "rewards/correct_code_reward_func": 0.2500000074505806, + "rewards/len_reward_func": 0.16580835729837418, "step": 22 }, { - "completion_length": 199.27083587646484, + "completion_length": 392.43751525878906, "epoch": 0.368, - "grad_norm": 0.4296758598414565, - "kl": 0.000273287296295166, - "learning_rate": 4.7019280826586604e-07, + "grad_norm": 1.106216791589586, + "kl": 0.0010752677917480469, + "learning_rate": 4.996644099560641e-07, "loss": 0.0, - "reward": 0.45347318053245544, - "reward_std": 0.3857118636369705, - "rewards/correct_code_reward_func": 0.1041666716337204, - "rewards/len_reward_func": 0.34930650889873505, + "reward": 0.3105452209711075, + "reward_std": 0.45272597670555115, + "rewards/correct_code_reward_func": 0.14583333395421505, + "rewards/len_reward_func": 0.16471190005540848, "step": 23 }, { - "completion_length": 221.77083587646484, + "completion_length": 575.0833740234375, "epoch": 0.384, - "grad_norm": 1.2892124765727169, - "kl": 0.0007638931274414062, - "learning_rate": 4.6704523002466094e-07, + "grad_norm": 0.5249553524839714, + "kl": 0.0004450082778930664, + "learning_rate": 4.995752940974918e-07, "loss": 0.0, - "reward": 0.4506646543741226, - "reward_std": 0.20875498466193676, - "rewards/correct_code_reward_func": 0.0416666679084301, - "rewards/len_reward_func": 0.40899796783924103, + "reward": 0.13311870768666267, + "reward_std": 0.46341927349567413, + "rewards/correct_code_reward_func": 0.1041666679084301, + "rewards/len_reward_func": 0.028952032327651978, "step": 24 }, { - "completion_length": 266.8333435058594, + "completion_length": 332.4583435058594, "epoch": 0.4, - "grad_norm": 1.4838265312357841, - "kl": 0.0009393692016601562, - "learning_rate": 4.6375134823700503e-07, + "grad_norm": 0.8221122197576101, + "kl": 0.0007367134094238281, + "learning_rate": 4.994757065594279e-07, "loss": 0.0, - "reward": 0.4240592122077942, - "reward_std": 0.36766907572746277, - "rewards/correct_code_reward_func": 0.1041666679084301, - "rewards/len_reward_func": 0.3198925405740738, + "reward": 0.23186111450195312, + "reward_std": 0.45240356028079987, + "rewards/correct_code_reward_func": 0.0416666679084301, + "rewards/len_reward_func": 0.19019444286823273, "step": 25 }, { - "completion_length": 225.20834350585938, + "completion_length": 492.22918701171875, "epoch": 0.416, - "grad_norm": 0.45923128114507034, - "kl": 0.0007772445678710938, - "learning_rate": 4.603133832077953e-07, + "grad_norm": 0.6644343017539492, + "kl": 0.00038242340087890625, + "learning_rate": 4.993656515203662e-07, "loss": 0.0, - "reward": 0.48156437277793884, - "reward_std": 0.4315789043903351, - "rewards/correct_code_reward_func": 0.1458333358168602, - "rewards/len_reward_func": 0.33573102951049805, + "reward": 0.3881392180919647, + "reward_std": 0.45405760407447815, + "rewards/correct_code_reward_func": 0.291666679084301, + "rewards/len_reward_func": 0.09647253155708313, "step": 26 }, { - "completion_length": 127.33333587646484, + "completion_length": 404.875, "epoch": 0.432, - "grad_norm": 0.5853476018540942, - "kl": 0.00115966796875, - "learning_rate": 4.5673365236403216e-07, + "grad_norm": 1.430989391876855, + "kl": 0.0005321502685546875, + "learning_rate": 4.992451335979955e-07, "loss": 0.0, - "reward": 0.5744044184684753, - "reward_std": 0.43071019649505615, - "rewards/correct_code_reward_func": 0.2708333432674408, - "rewards/len_reward_func": 0.30357107520103455, + "reward": 0.5776854753494263, + "reward_std": 0.41993752121925354, + "rewards/correct_code_reward_func": 0.4166666865348816, + "rewards/len_reward_func": 0.16101885586977005, "step": 27 }, { - "completion_length": 273.1666717529297, + "completion_length": 399.79168701171875, "epoch": 0.448, - "grad_norm": 0.4925160720854453, - "kl": 0.0010128021240234375, - "learning_rate": 4.530145686927125e-07, + "grad_norm": 0.6573879108863315, + "kl": 0.000904083251953125, + "learning_rate": 4.991141578490066e-07, "loss": 0.0, - "reward": 0.5949445962905884, - "reward_std": 0.42838945984840393, - "rewards/correct_code_reward_func": 0.2708333432674408, - "rewards/len_reward_func": 0.3241112679243088, + "reward": 0.5069835484027863, + "reward_std": 0.4736175239086151, + "rewards/correct_code_reward_func": 0.3125000149011612, + "rewards/len_reward_func": 0.19448353350162506, "step": 28 }, { - "completion_length": 178.75000762939453, + "completion_length": 312.31251525878906, "epoch": 0.464, - "grad_norm": 1.1463419828713735, - "kl": 0.0027008056640625, - "learning_rate": 4.4915863911430897e-07, + "grad_norm": 1.8201559702600916, + "kl": 0.00235748291015625, + "learning_rate": 4.989727297688796e-07, "loss": 0.0, - "reward": 0.3950851857662201, - "reward_std": 0.4124990254640579, - "rewards/correct_code_reward_func": 0.08333333395421505, - "rewards/len_reward_func": 0.3117518424987793, + "reward": 0.35250288248062134, + "reward_std": 0.4461039751768112, + "rewards/correct_code_reward_func": 0.1458333395421505, + "rewards/len_reward_func": 0.20666955411434174, "step": 29 }, { - "completion_length": 210.0, + "completion_length": 349.7083435058594, "epoch": 0.48, - "grad_norm": 0.38472151178254205, - "kl": 0.0011115074157714844, - "learning_rate": 4.45168462792932e-07, + "grad_norm": 0.5301714114961311, + "kl": 0.000766754150390625, + "learning_rate": 4.988208552916535e-07, "loss": 0.0, - "reward": 0.48962171375751495, - "reward_std": 0.443721666932106, - "rewards/correct_code_reward_func": 0.1666666716337204, - "rewards/len_reward_func": 0.32295504212379456, + "reward": 0.4679892510175705, + "reward_std": 0.3862190693616867, + "rewards/correct_code_reward_func": 0.2916666716337204, + "rewards/len_reward_func": 0.1763225868344307, "step": 30 }, { - "completion_length": 204.95833587646484, + "completion_length": 450.2916717529297, "epoch": 0.496, - "grad_norm": 1.5507179471403478, - "kl": 0.00183868408203125, - "learning_rate": 4.4104672938431223e-07, + "grad_norm": 0.9601897979350579, + "kl": 0.0031280517578125, + "learning_rate": 4.986585407896771e-07, "loss": 0.0, - "reward": 0.6203064322471619, - "reward_std": 0.39608660340309143, - "rewards/correct_code_reward_func": 0.2916666865348816, - "rewards/len_reward_func": 0.32863976061344147, + "reward": 0.5194195210933685, + "reward_std": 0.5953386127948761, + "rewards/correct_code_reward_func": 0.3958333432674408, + "rewards/len_reward_func": 0.12358617037534714, "step": 31 }, { - "completion_length": 159.83333587646484, + "completion_length": 310.75001525878906, "epoch": 0.512, - "grad_norm": 0.9492121763047169, - "kl": 0.001636505126953125, - "learning_rate": 4.367962172227866e-07, + "grad_norm": 1.6987129262106306, + "kl": 0.006591796875, + "learning_rate": 4.984857930733419e-07, "loss": 0.0, - "reward": 0.5581120550632477, - "reward_std": 0.4106949418783188, - "rewards/correct_code_reward_func": 0.25, - "rewards/len_reward_func": 0.3081120699644089, + "reward": 0.41354838013648987, + "reward_std": 0.47000962495803833, + "rewards/correct_code_reward_func": 0.2500000149011612, + "rewards/len_reward_func": 0.16354837268590927, "step": 32 }, { - "completion_length": 214.52083587646484, + "completion_length": 383.25001525878906, "epoch": 0.528, - "grad_norm": 1.1197088117962448, - "kl": 0.005927085876464844, - "learning_rate": 4.324197914485075e-07, + "grad_norm": 0.7545967354246785, + "kl": 0.0016632080078125, + "learning_rate": 4.98302619390796e-07, "loss": 0.0, - "reward": 0.4833623319864273, - "reward_std": 0.39532361924648285, - "rewards/correct_code_reward_func": 0.1250000037252903, - "rewards/len_reward_func": 0.3583623170852661, + "reward": 0.46299508213996887, + "reward_std": 0.5469619035720825, + "rewards/correct_code_reward_func": 0.3125, + "rewards/len_reward_func": 0.15049506723880768, "step": 33 }, { - "completion_length": 255.39583587646484, + "completion_length": 386.97918701171875, "epoch": 0.544, - "grad_norm": 0.4856761942711223, - "kl": 0.00038909912109375, - "learning_rate": 4.2792040207614e-07, + "grad_norm": 0.40693320383732745, + "kl": 0.0008487701416015625, + "learning_rate": 4.981090274276405e-07, "loss": 0.0, - "reward": 0.3419315367937088, - "reward_std": 0.23230206966400146, - "rewards/correct_code_reward_func": 0.02083333395421505, - "rewards/len_reward_func": 0.3210982233285904, + "reward": 0.36661335825920105, + "reward_std": 0.599495530128479, + "rewards/correct_code_reward_func": 0.20833333395421505, + "rewards/len_reward_func": 0.15827999636530876, "step": 34 }, { - "completion_length": 215.93750762939453, + "completion_length": 364.2083435058594, "epoch": 0.56, - "grad_norm": 0.6317607844858494, - "kl": 0.00643157958984375, - "learning_rate": 4.2330108200634723e-07, + "grad_norm": 3.038529641996517, + "kl": 0.013214111328125, + "learning_rate": 4.979050253066063e-07, "loss": 0.0, - "reward": 0.48397234082221985, - "reward_std": 0.36393553018569946, - "rewards/correct_code_reward_func": 0.1458333432674408, - "rewards/len_reward_func": 0.33813901245594025, + "reward": 0.38925309479236603, + "reward_std": 0.4519564062356949, + "rewards/correct_code_reward_func": 0.1875000111758709, + "rewards/len_reward_func": 0.20175307989120483, "step": 35 }, { - "completion_length": 170.52084350585938, + "completion_length": 223.20833587646484, "epoch": 0.576, - "grad_norm": 0.47983643730367526, - "kl": 0.0009899139404296875, - "learning_rate": 4.185649449814045e-07, + "grad_norm": 2.194175778485753, + "kl": 0.00901031494140625, + "learning_rate": 4.976906215872137e-07, "loss": 0.0, - "reward": 0.5446349233388901, - "reward_std": 0.475721538066864, - "rewards/correct_code_reward_func": 0.229166679084301, - "rewards/len_reward_func": 0.3154682517051697, + "reward": 0.5131015926599503, + "reward_std": 0.42744188010692596, + "rewards/correct_code_reward_func": 0.2708333432674408, + "rewards/len_reward_func": 0.24226826429367065, "step": 36 }, { - "completion_length": 155.41666793823242, + "completion_length": 309.8958435058594, "epoch": 0.592, - "grad_norm": 0.7182646382516522, - "kl": 0.00705718994140625, - "learning_rate": 4.137151834863213e-07, + "grad_norm": 1.0307041561865542, + "kl": 0.0041351318359375, + "learning_rate": 4.974658252654134e-07, "loss": 0.0, - "reward": 0.659955769777298, - "reward_std": 0.4905228465795517, - "rewards/correct_code_reward_func": 0.2916666865348816, - "rewards/len_reward_func": 0.3682890981435776, + "reward": 0.5382832139730453, + "reward_std": 0.49117420613765717, + "rewards/correct_code_reward_func": 0.3750000149011612, + "rewards/len_reward_func": 0.16328320652246475, "step": 37 }, { - "completion_length": 234.35416793823242, + "completion_length": 243.45833587646484, "epoch": 0.608, - "grad_norm": 0.5385402773601645, - "kl": 0.001377105712890625, - "learning_rate": 4.087550665968846e-07, + "grad_norm": 0.845257170990527, + "kl": 0.005828857421875, + "learning_rate": 4.97230645773209e-07, "loss": 0.0, - "reward": 0.34026579558849335, - "reward_std": 0.42601510882377625, - "rewards/correct_code_reward_func": 0.10416666977107525, - "rewards/len_reward_func": 0.23609913140535355, + "reward": 0.3587033599615097, + "reward_std": 0.40543846786022186, + "rewards/correct_code_reward_func": 0.1041666716337204, + "rewards/len_reward_func": 0.2545367032289505, "step": 38 }, { - "completion_length": 150.02083587646484, + "completion_length": 347.81251525878906, "epoch": 0.624, - "grad_norm": 0.8263933056203022, - "kl": 0.0065155029296875, - "learning_rate": 4.036879377760752e-07, + "grad_norm": 0.6831356250050794, + "kl": 0.0037384033203125, + "learning_rate": 4.96985092978261e-07, "loss": 0.0, - "reward": 0.6042845845222473, - "reward_std": 0.4889480620622635, - "rewards/correct_code_reward_func": 0.2708333432674408, - "rewards/len_reward_func": 0.3334512561559677, + "reward": 0.42208923399448395, + "reward_std": 0.48525381088256836, + "rewards/correct_code_reward_func": 0.2916666716337204, + "rewards/len_reward_func": 0.13042253628373146, "step": 39 }, { - "completion_length": 168.77084350585938, + "completion_length": 164.64583587646484, "epoch": 0.64, - "grad_norm": 0.6434934249206259, - "kl": 0.00261688232421875, - "learning_rate": 3.9851721262034157e-07, + "grad_norm": 0.7868888492846406, + "kl": 0.00518798828125, + "learning_rate": 4.967291771834726e-07, "loss": 0.0, - "reward": 0.4757592976093292, - "reward_std": 0.3624489903450012, - "rewards/correct_code_reward_func": 0.125, - "rewards/len_reward_func": 0.3507593274116516, + "reward": 0.37410612404346466, + "reward_std": 0.42655548453330994, + "rewards/correct_code_reward_func": 0.1041666716337204, + "rewards/len_reward_func": 0.26993944495916367, "step": 40 }, { - "completion_length": 194.0833396911621, + "completion_length": 176.70833587646484, "epoch": 0.656, - "grad_norm": 0.5493289883401115, - "kl": 0.003704071044921875, - "learning_rate": 3.932463765572505e-07, + "grad_norm": 0.9834593603969678, + "kl": 0.011749267578125, + "learning_rate": 4.964629091265583e-07, "loss": 0.0, - "reward": 0.42005764693021774, - "reward_std": 0.37090964615345, - "rewards/correct_code_reward_func": 0.16666667722165585, - "rewards/len_reward_func": 0.25339096784591675, + "reward": 0.493656724691391, + "reward_std": 0.32837581634521484, + "rewards/correct_code_reward_func": 0.2083333432674408, + "rewards/len_reward_func": 0.285323366522789, "step": 41 }, { - "completion_length": 223.87500762939453, + "completion_length": 193.7916717529297, "epoch": 0.672, - "grad_norm": 0.43766199341669654, - "kl": 0.003650665283203125, - "learning_rate": 3.8787898249606767e-07, + "grad_norm": 0.4763159825214015, + "kl": 0.0091552734375, + "learning_rate": 4.961862999795923e-07, "loss": 0.0, - "reward": 0.35570722818374634, - "reward_std": 0.3742265850305557, - "rewards/correct_code_reward_func": 0.0416666679084301, - "rewards/len_reward_func": 0.31404057145118713, + "reward": 0.43107903003692627, + "reward_std": 0.3805217146873474, + "rewards/correct_code_reward_func": 0.10416666977107525, + "rewards/len_reward_func": 0.3269123286008835, "step": 42 }, { - "completion_length": 159.93750762939453, + "completion_length": 251.31250762939453, "epoch": 0.688, - "grad_norm": 0.662510453229864, - "kl": 0.004608154296875, - "learning_rate": 3.8241864843284964e-07, + "grad_norm": 0.926543039135598, + "kl": 0.007232666015625, + "learning_rate": 4.958993613485405e-07, "loss": 0.0, - "reward": 0.6946325600147247, - "reward_std": 0.4132131338119507, - "rewards/correct_code_reward_func": 0.3125000074505806, - "rewards/len_reward_func": 0.38213254511356354, + "reward": 0.47894081473350525, + "reward_std": 0.501429408788681, + "rewards/correct_code_reward_func": 0.3125, + "rewards/len_reward_func": 0.16644081473350525, "step": 43 }, { - "completion_length": 202.43750762939453, + "completion_length": 277.06251525878906, "epoch": 0.704, - "grad_norm": 0.4902137587205668, - "kl": 0.006561279296875, - "learning_rate": 3.768690550116639e-07, + "grad_norm": 0.4442878657111374, + "kl": 0.0048065185546875, + "learning_rate": 4.956021052727731e-07, "loss": 0.0, - "reward": 0.4793711006641388, - "reward_std": 0.4331624209880829, - "rewards/correct_code_reward_func": 0.1875000074505806, - "rewards/len_reward_func": 0.2918711006641388, + "reward": 0.4777670353651047, + "reward_std": 0.4489281326532364, + "rewards/correct_code_reward_func": 0.2500000149011612, + "rewards/len_reward_func": 0.22776702046394348, "step": 44 }, { - "completion_length": 201.52084350585938, + "completion_length": 236.25, "epoch": 0.72, - "grad_norm": 1.5244716072983422, - "kl": 0.0045623779296875, - "learning_rate": 3.712339430435792e-07, + "grad_norm": 0.8268091601529216, + "kl": 0.0167388916015625, + "learning_rate": 4.952945442245597e-07, "loss": 0.0, - "reward": 0.376946821808815, - "reward_std": 0.36263740062713623, - "rewards/correct_code_reward_func": 0.0833333358168602, - "rewards/len_reward_func": 0.2936134934425354, + "reward": 0.3185321241617203, + "reward_std": 0.3292020410299301, + "rewards/correct_code_reward_func": 0.06250000186264515, + "rewards/len_reward_func": 0.2560321241617203, "step": 45 }, { - "completion_length": 211.87501525878906, + "completion_length": 255.20834350585938, "epoch": 0.736, - "grad_norm": 0.47019285013227063, - "kl": 0.00263214111328125, - "learning_rate": 3.65517110985099e-07, + "grad_norm": 0.5352983780133181, + "kl": 0.0109405517578125, + "learning_rate": 4.949766911085461e-07, "loss": 0.0, - "reward": 0.4974285364151001, - "reward_std": 0.3727869838476181, - "rewards/correct_code_reward_func": 0.12500000558793545, - "rewards/len_reward_func": 0.3724285215139389, + "reward": 0.40212514996528625, + "reward_std": 0.4124213755130768, + "rewards/correct_code_reward_func": 0.16666667722165585, + "rewards/len_reward_func": 0.23545847833156586, "step": 46 }, { - "completion_length": 128.29166793823242, + "completion_length": 206.18750762939453, "epoch": 0.752, - "grad_norm": 0.5038135341546401, - "kl": 0.00347900390625, - "learning_rate": 3.597224123777389e-07, + "grad_norm": 1.0989971408848502, + "kl": 0.012786865234375, + "learning_rate": 4.946485592612122e-07, "loss": 0.0, - "reward": 0.608631819486618, - "reward_std": 0.4802524596452713, - "rewards/correct_code_reward_func": 0.3541666865348816, - "rewards/len_reward_func": 0.25446511805057526, + "reward": 0.5097324252128601, + "reward_std": 0.44956831634044647, + "rewards/correct_code_reward_func": 0.2708333432674408, + "rewards/len_reward_func": 0.23889903724193573, "step": 47 }, { - "completion_length": 161.8541717529297, + "completion_length": 129.08333587646484, "epoch": 0.768, - "grad_norm": 0.5836224104866172, - "kl": 0.0087890625, - "learning_rate": 3.5385375325047163e-07, + "grad_norm": 1.030854950339047, + "kl": 0.03948974609375, + "learning_rate": 4.943101624503132e-07, "loss": 0.0, - "reward": 0.5433830618858337, - "reward_std": 0.4709310829639435, - "rewards/correct_code_reward_func": 0.2291666679084301, - "rewards/len_reward_func": 0.31421639025211334, + "reward": 0.616540938615799, + "reward_std": 0.528562068939209, + "rewards/correct_code_reward_func": 0.375, + "rewards/len_reward_func": 0.24154090136289597, "step": 48 }, { - "completion_length": 140.47916793823242, + "completion_length": 126.66667175292969, "epoch": 0.784, - "grad_norm": 0.49678584086470384, - "kl": 0.0029144287109375, - "learning_rate": 3.479150894867926e-07, + "grad_norm": 1.2775292444091597, + "kl": 0.0452880859375, + "learning_rate": 4.939615148743017e-07, "loss": 0.0, - "reward": 0.5527551472187042, - "reward_std": 0.5054349154233932, - "rewards/correct_code_reward_func": 0.22916667722165585, - "rewards/len_reward_func": 0.3235885202884674, + "reward": 0.5280424952507019, + "reward_std": 0.5158527791500092, + "rewards/correct_code_reward_func": 0.2708333395421505, + "rewards/len_reward_func": 0.2572091445326805, "step": 49 }, { - "completion_length": 190.87500762939453, + "completion_length": 128.93750381469727, "epoch": 0.8, - "grad_norm": 1.1393985621399754, - "kl": 0.004364013671875, - "learning_rate": 3.4191042415818e-07, + "grad_norm": 0.9671908865335344, + "kl": 0.025238037109375, + "learning_rate": 4.936026311617316e-07, "loss": 0.0, - "reward": 0.4087870866060257, - "reward_std": 0.35092872381210327, - "rewards/correct_code_reward_func": 0.1041666716337204, - "rewards/len_reward_func": 0.3046204149723053, + "reward": 0.4106781929731369, + "reward_std": 0.4047301709651947, + "rewards/correct_code_reward_func": 0.12500000558793545, + "rewards/len_reward_func": 0.2856782227754593, "step": 50 }, { - "completion_length": 189.95833587646484, + "completion_length": 173.31250762939453, "epoch": 0.816, - "grad_norm": 1.1299817912717918, - "kl": 0.01027679443359375, - "learning_rate": 3.3584380482574717e-07, - "loss": 0.0, - "reward": 0.48216497898101807, - "reward_std": 0.35060346126556396, - "rewards/correct_code_reward_func": 0.1875, - "rewards/len_reward_func": 0.29466497898101807, + "grad_norm": 0.7968037935478003, + "kl": 0.0194091796875, + "learning_rate": 4.932335263706445e-07, + "loss": 0.0, + "reward": 0.6258751451969147, + "reward_std": 0.4580596834421158, + "rewards/correct_code_reward_func": 0.458333358168602, + "rewards/len_reward_func": 0.16754178702831268, "step": 51 }, { - "completion_length": 155.20833587646484, + "completion_length": 141.7083396911621, "epoch": 0.832, - "grad_norm": 1.0099136669800142, - "kl": 0.0089263916015625, - "learning_rate": 3.297193208119047e-07, + "grad_norm": 1.006183031163246, + "kl": 0.03253173828125, + "learning_rate": 4.928542159879385e-07, "loss": 0.0, - "reward": 0.7047297656536102, - "reward_std": 0.537945419549942, - "rewards/correct_code_reward_func": 0.3750000149011612, - "rewards/len_reward_func": 0.32972970604896545, + "reward": 0.6714096814393997, + "reward_std": 0.5154012739658356, + "rewards/correct_code_reward_func": 0.4583333432674408, + "rewards/len_reward_func": 0.21307633817195892, "step": 52 }, { - "completion_length": 119.43750381469727, + "completion_length": 91.47916793823242, "epoch": 0.848, - "grad_norm": 0.6910875224957795, - "kl": 0.00537109375, - "learning_rate": 3.235411004438741e-07, + "grad_norm": 1.4433890659727795, + "kl": 0.0452880859375, + "learning_rate": 4.924647159287175e-07, "loss": 0.0, - "reward": 0.37493598461151123, - "reward_std": 0.32232099026441574, - "rewards/correct_code_reward_func": 0.0416666679084301, - "rewards/len_reward_func": 0.33326931297779083, + "reward": 0.3492863178253174, + "reward_std": 0.31160949170589447, + "rewards/correct_code_reward_func": 0.02083333395421505, + "rewards/len_reward_func": 0.3284529894590378, "step": 53 }, { - "completion_length": 239.00001525878906, + "completion_length": 89.06250381469727, "epoch": 0.864, - "grad_norm": 0.9366579226902805, - "kl": 0.01202392578125, - "learning_rate": 3.173133082709086e-07, + "grad_norm": 1.244678229537496, + "kl": 0.0367431640625, + "learning_rate": 4.920650425356239e-07, "loss": 0.0, - "reward": 0.5343351364135742, - "reward_std": 0.39477548003196716, - "rewards/correct_code_reward_func": 0.2083333358168602, - "rewards/len_reward_func": 0.32600177824497223, + "reward": 0.4604046642780304, + "reward_std": 0.3904419094324112, + "rewards/correct_code_reward_func": 0.10416666977107525, + "rewards/len_reward_func": 0.3562380075454712, "step": 54 }, { - "completion_length": 165.6458396911621, + "completion_length": 89.93750381469727, "epoch": 0.88, - "grad_norm": 0.5543105836651998, - "kl": 0.0074310302734375, - "learning_rate": 3.1104014225709784e-07, + "grad_norm": 0.9722090267696216, + "kl": 0.0489501953125, + "learning_rate": 4.916552125781528e-07, "loss": 0.0, - "reward": 0.5656554698944092, - "reward_std": 0.4120694398880005, - "rewards/correct_code_reward_func": 0.2500000111758709, - "rewards/len_reward_func": 0.315655454993248, + "reward": 0.5890590101480484, + "reward_std": 0.40695688128471375, + "rewards/correct_code_reward_func": 0.27083333395421505, + "rewards/len_reward_func": 0.3182256817817688, "step": 55 }, { - "completion_length": 224.8541717529297, + "completion_length": 129.1666717529297, "epoch": 0.896, - "grad_norm": 0.4221238954900066, - "kl": 0.0029735565185546875, - "learning_rate": 3.0472583095164873e-07, + "grad_norm": 0.8934927590767672, + "kl": 0.031707763671875, + "learning_rate": 4.912352432519484e-07, "loss": 0.0, - "reward": 0.4594677835702896, - "reward_std": 0.3312383443117142, - "rewards/correct_code_reward_func": 0.10416666977107525, - "rewards/len_reward_func": 0.3553011566400528, + "reward": 0.43804308772087097, + "reward_std": 0.3997139036655426, + "rewards/correct_code_reward_func": 0.1458333395421505, + "rewards/len_reward_func": 0.2922097444534302, "step": 56 }, { - "completion_length": 135.8541717529297, + "completion_length": 85.87500381469727, "epoch": 0.912, - "grad_norm": 0.7883647712663692, - "kl": 0.005859375, - "learning_rate": 2.983746306385499e-07, + "grad_norm": 1.059356454998566, + "kl": 0.0428466796875, + "learning_rate": 4.908051521780824e-07, "loss": 0.0, - "reward": 0.5004815310239792, - "reward_std": 0.4576799273490906, - "rewards/correct_code_reward_func": 0.1458333395421505, - "rewards/len_reward_func": 0.3546481877565384, + "reward": 0.46079379320144653, + "reward_std": 0.5179382562637329, + "rewards/correct_code_reward_func": 0.1875000111758709, + "rewards/len_reward_func": 0.2732938081026077, "step": 57 }, { - "completion_length": 177.625, + "completion_length": 56.1875, "epoch": 0.928, - "grad_norm": 0.823591617621225, - "kl": 0.00780487060546875, - "learning_rate": 2.919908224675412e-07, - "loss": 0.0, - "reward": 0.47625844180583954, - "reward_std": 0.4644129127264023, - "rewards/correct_code_reward_func": 0.1666666716337204, - "rewards/len_reward_func": 0.30959178507328033, + "grad_norm": 1.5024432197845348, + "kl": 0.08740234375, + "learning_rate": 4.90364957402315e-07, + "loss": 0.0001, + "reward": 0.4640047252178192, + "reward_std": 0.4288184642791748, + "rewards/correct_code_reward_func": 0.1875000074505806, + "rewards/len_reward_func": 0.2765047252178192, "step": 58 }, { - "completion_length": 111.95833587646484, + "completion_length": 73.39583396911621, "epoch": 0.944, - "grad_norm": 0.7895834198702301, - "kl": 0.01739501953125, - "learning_rate": 2.8557870956832133e-07, - "loss": 0.0, - "reward": 0.45759040117263794, - "reward_std": 0.33291806280612946, - "rewards/correct_code_reward_func": 0.125, - "rewards/len_reward_func": 0.3325904309749603, + "grad_norm": 1.7650779542824517, + "kl": 0.10009765625, + "learning_rate": 4.899146773943373e-07, + "loss": 0.0001, + "reward": 0.5490352362394333, + "reward_std": 0.31501778960227966, + "rewards/correct_code_reward_func": 0.1875, + "rewards/len_reward_func": 0.3615352660417557, "step": 59 }, { - "completion_length": 200.64583587646484, + "completion_length": 73.18750190734863, "epoch": 0.96, - "grad_norm": 0.398008582749878, - "kl": 0.00305938720703125, - "learning_rate": 2.7914261414993976e-07, - "loss": 0.0, - "reward": 0.5845803320407867, - "reward_std": 0.3288002014160156, + "grad_norm": 0.7172587724443913, + "kl": 0.0584716796875, + "learning_rate": 4.894543310469967e-07, + "loss": 0.0001, + "reward": 0.4920351505279541, + "reward_std": 0.4270896017551422, "rewards/correct_code_reward_func": 0.2291666716337204, - "rewards/len_reward_func": 0.35541366040706635, + "rewards/len_reward_func": 0.2628684788942337, "step": 60 }, { - "completion_length": 123.75000762939453, + "completion_length": 101.14583969116211, "epoch": 0.976, - "grad_norm": 0.6258506341186686, - "kl": 0.00853729248046875, - "learning_rate": 2.726868745873286e-07, - "loss": 0.0, - "reward": 0.5053079277276993, - "reward_std": 0.45688633620738983, - "rewards/correct_code_reward_func": 0.16666667722165585, - "rewards/len_reward_func": 0.3386412411928177, + "grad_norm": 0.9692991425732275, + "kl": 0.077880859375, + "learning_rate": 4.88983937675504e-07, + "loss": 0.0001, + "reward": 0.5707934498786926, + "reward_std": 0.4804637283086777, + "rewards/correct_code_reward_func": 0.25000000558793545, + "rewards/len_reward_func": 0.3207934498786926, "step": 61 }, { - "completion_length": 178.4166717529297, + "completion_length": 90.43750381469727, "epoch": 0.992, - "grad_norm": 0.4226697044495165, - "kl": 0.004604339599609375, - "learning_rate": 2.662158424969357e-07, - "loss": 0.0, - "reward": 0.4181392341852188, - "reward_std": 0.39034655690193176, - "rewards/correct_code_reward_func": 0.1041666679084301, - "rewards/len_reward_func": 0.3139725774526596, + "grad_norm": 0.9307526215492377, + "kl": 0.0550537109375, + "learning_rate": 4.885035170166228e-07, + "loss": 0.0001, + "reward": 0.4401155561208725, + "reward_std": 0.2671549841761589, + "rewards/correct_code_reward_func": 0.1041666716337204, + "rewards/len_reward_func": 0.3359488919377327, "step": 62 }, { - "completion_length": 150.375, + "completion_length": 79.08333587646484, "epoch": 1.0, - "grad_norm": 0.4226697044495165, - "kl": 0.00823974609375, - "learning_rate": 2.597338798034344e-07, - "loss": 0.0, - "reward": 0.7250348925590515, - "reward_std": 0.3359350562095642, - "rewards/correct_code_reward_func": 0.375, - "rewards/len_reward_func": 0.3500348925590515, + "grad_norm": 0.9307526215492377, + "kl": 0.1474609375, + "learning_rate": 4.880130892278419e-07, + "loss": 0.0001, + "reward": 0.4844928979873657, + "reward_std": 0.3104479908943176, + "rewards/correct_code_reward_func": 0.0833333358168602, + "rewards/len_reward_func": 0.4011596143245697, "step": 63 }, { - "completion_length": 137.45833587646484, + "completion_length": 65.97916793823242, "epoch": 1.016, - "grad_norm": 0.621781928712815, - "kl": 0.0059356689453125, - "learning_rate": 2.532453557994827e-07, - "loss": 0.0, - "reward": 0.5079408586025238, - "reward_std": 0.43556541204452515, - "rewards/correct_code_reward_func": 0.16666667722165585, - "rewards/len_reward_func": 0.3412741720676422, + "grad_norm": 1.2251318507150297, + "kl": 0.09423828125, + "learning_rate": 4.875126748865289e-07, + "loss": 0.0001, + "reward": 0.5488586723804474, + "reward_std": 0.3761359751224518, + "rewards/correct_code_reward_func": 0.2083333432674408, + "rewards/len_reward_func": 0.3405253440141678, "step": 64 }, { - "completion_length": 237.45834350585938, + "completion_length": 80.97916793823242, "epoch": 1.032, - "grad_norm": 0.4700296188876399, - "kl": 0.00553131103515625, - "learning_rate": 2.467546442005173e-07, - "loss": 0.0, - "reward": 0.3794639855623245, - "reward_std": 0.3328210711479187, - "rewards/correct_code_reward_func": 0.0625, - "rewards/len_reward_func": 0.3169640153646469, + "grad_norm": 1.7067808615741822, + "kl": 0.0869140625, + "learning_rate": 4.870022949890676e-07, + "loss": 0.0001, + "reward": 0.47587963938713074, + "reward_std": 0.40881581604480743, + "rewards/correct_code_reward_func": 0.125, + "rewards/len_reward_func": 0.3508796691894531, "step": 65 }, { - "completion_length": 217.33333587646484, + "completion_length": 68.125, "epoch": 1.048, - "grad_norm": 0.47407157003975803, - "kl": 0.01959228515625, - "learning_rate": 2.4026612019656556e-07, - "loss": 0.0, - "reward": 0.5135317444801331, - "reward_std": 0.3767416924238205, - "rewards/correct_code_reward_func": 0.2083333395421505, - "rewards/len_reward_func": 0.30519840121269226, + "grad_norm": 0.9894209736736024, + "kl": 0.08544921875, + "learning_rate": 4.864819709499761e-07, + "loss": 0.0001, + "reward": 0.5112280547618866, + "reward_std": 0.39189808815717697, + "rewards/correct_code_reward_func": 0.1666666679084301, + "rewards/len_reward_func": 0.3445614129304886, "step": 66 }, { - "completion_length": 159.9791717529297, + "completion_length": 82.16666793823242, "epoch": 1.064, - "grad_norm": 1.33408581993753, - "kl": 0.004638671875, - "learning_rate": 2.337841575030642e-07, - "loss": 0.0, - "reward": 0.5043017268180847, - "reward_std": 0.5010640621185303, - "rewards/correct_code_reward_func": 0.1875000111758709, - "rewards/len_reward_func": 0.3168017417192459, + "grad_norm": 1.0146554350650319, + "kl": 0.13720703125, + "learning_rate": 4.85951724601009e-07, + "loss": 0.0001, + "reward": 0.5718176364898682, + "reward_std": 0.41909658908843994, + "rewards/correct_code_reward_func": 0.2500000074505806, + "rewards/len_reward_func": 0.32181763648986816, "step": 67 }, { - "completion_length": 196.06250762939453, + "completion_length": 67.18750190734863, "epoch": 1.08, - "grad_norm": 0.39338975551762256, - "kl": 0.0045166015625, - "learning_rate": 2.2731312541267143e-07, - "loss": 0.0, - "reward": 0.40096263587474823, - "reward_std": 0.38369233906269073, - "rewards/correct_code_reward_func": 0.1041666679084301, - "rewards/len_reward_func": 0.29679596424102783, + "grad_norm": 1.209065945657589, + "kl": 0.13037109375, + "learning_rate": 4.854115781902414e-07, + "loss": 0.0001, + "reward": 0.38857993483543396, + "reward_std": 0.37996064126491547, + "rewards/correct_code_reward_func": 0.0625, + "rewards/len_reward_func": 0.32607994973659515, "step": 68 }, { - "completion_length": 168.20833587646484, + "completion_length": 63.9375, "epoch": 1.096, - "grad_norm": 0.5733635012795789, - "kl": 0.011474609375, - "learning_rate": 2.2085738585006021e-07, - "loss": 0.0, - "reward": 0.5634751617908478, - "reward_std": 0.42181093990802765, - "rewards/correct_code_reward_func": 0.229166679084301, - "rewards/len_reward_func": 0.3343084752559662, + "grad_norm": 1.0015586737181452, + "kl": 0.121337890625, + "learning_rate": 4.848615543811344e-07, + "loss": 0.0001, + "reward": 0.6716190874576569, + "reward_std": 0.3675233870744705, + "rewards/correct_code_reward_func": 0.2916666716337204, + "rewards/len_reward_func": 0.37995241582393646, "step": 69 }, { - "completion_length": 131.62500762939453, + "completion_length": 45.00000190734863, "epoch": 1.112, - "grad_norm": 0.5768348851420809, - "kl": 0.0180511474609375, - "learning_rate": 2.1442129043167873e-07, - "loss": 0.0, - "reward": 0.44008754193782806, - "reward_std": 0.4175649434328079, - "rewards/correct_code_reward_func": 0.1458333358168602, - "rewards/len_reward_func": 0.29425420612096786, + "grad_norm": 1.3746237501719711, + "kl": 0.166015625, + "learning_rate": 4.843016762515859e-07, + "loss": 0.0002, + "reward": 0.38038890063762665, + "reward_std": 0.32317543029785156, + "rewards/correct_code_reward_func": 0.0416666679084301, + "rewards/len_reward_func": 0.33872224390506744, "step": 70 }, { - "completion_length": 178.1041717529297, + "completion_length": 57.395835876464844, "epoch": 1.1280000000000001, - "grad_norm": 0.8562478784181595, - "kl": 0.01507568359375, - "learning_rate": 2.0800917753245875e-07, - "loss": 0.0, - "reward": 0.42609627544879913, - "reward_std": 0.395632266998291, - "rewards/correct_code_reward_func": 0.0833333358168602, - "rewards/len_reward_func": 0.34276294708251953, + "grad_norm": 2.2658811424218266, + "kl": 0.086669921875, + "learning_rate": 4.837319672929606e-07, + "loss": 0.0001, + "reward": 0.4675743877887726, + "reward_std": 0.29566246271133423, + "rewards/correct_code_reward_func": 0.1041666716337204, + "rewards/len_reward_func": 0.363407701253891, "step": 71 }, { - "completion_length": 142.43750381469727, + "completion_length": 39.66666793823242, "epoch": 1.144, - "grad_norm": 0.5850086202761346, - "kl": 0.007049560546875, - "learning_rate": 2.0162536936145008e-07, - "loss": 0.0, - "reward": 0.6273844540119171, - "reward_std": 0.37064287066459656, - "rewards/correct_code_reward_func": 0.2708333432674408, - "rewards/len_reward_func": 0.3565511107444763, + "grad_norm": 0.9853509970570241, + "kl": 0.1171875, + "learning_rate": 4.831524514091056e-07, + "loss": 0.0001, + "reward": 0.5477184951305389, + "reward_std": 0.4072943925857544, + "rewards/correct_code_reward_func": 0.1875000074505806, + "rewards/len_reward_func": 0.36021851003170013, "step": 72 }, { - "completion_length": 173.4166717529297, + "completion_length": 40.10416793823242, "epoch": 1.16, - "grad_norm": 1.0060077647295973, - "kl": 0.01995849609375, - "learning_rate": 1.9527416904835132e-07, - "loss": 0.0, - "reward": 0.6327731013298035, - "reward_std": 0.40126484632492065, - "rewards/correct_code_reward_func": 0.3125, - "rewards/len_reward_func": 0.3202730864286423, + "grad_norm": 2.4194323860380593, + "kl": 0.18359375, + "learning_rate": 4.825631529153466e-07, + "loss": 0.0002, + "reward": 0.49018459022045135, + "reward_std": 0.25917236506938934, + "rewards/correct_code_reward_func": 0.16666667722165585, + "rewards/len_reward_func": 0.32351788878440857, "step": 73 }, { - "completion_length": 196.5416717529297, + "completion_length": 29.562500953674316, "epoch": 1.176, - "grad_norm": 0.5247248856581095, - "kl": 0.004150390625, - "learning_rate": 1.889598577429022e-07, - "loss": 0.0, - "reward": 0.4798154681921005, - "reward_std": 0.4057523310184479, - "rewards/correct_code_reward_func": 0.1458333395421505, - "rewards/len_reward_func": 0.33398209512233734, + "grad_norm": 1.687397775503762, + "kl": 0.40771484375, + "learning_rate": 4.81964096537468e-07, + "loss": 0.0004, + "reward": 0.43547162413597107, + "reward_std": 0.2647310718894005, + "rewards/correct_code_reward_func": 0.0416666679084301, + "rewards/len_reward_func": 0.39380495250225067, "step": 74 }, { - "completion_length": 163.37500762939453, + "completion_length": 40.16666793823242, "epoch": 1.192, - "grad_norm": 0.44940299985092946, - "kl": 0.00665283203125, - "learning_rate": 1.8268669172909136e-07, - "loss": 0.0, - "reward": 0.6056158542633057, - "reward_std": 0.4253086894750595, - "rewards/correct_code_reward_func": 0.2708333432674408, - "rewards/len_reward_func": 0.3347824960947037, + "grad_norm": 1.1816855995040336, + "kl": 0.146484375, + "learning_rate": 4.81355307410676e-07, + "loss": 0.0001, + "reward": 0.5548797100782394, + "reward_std": 0.3682420402765274, + "rewards/correct_code_reward_func": 0.2083333432674408, + "rewards/len_reward_func": 0.3465464115142822, "step": 75 }, { - "completion_length": 127.60417175292969, + "completion_length": 93.93750190734863, "epoch": 1.208, - "grad_norm": 0.5154656852815648, - "kl": 0.0311279296875, - "learning_rate": 1.7645889955612592e-07, - "loss": 0.0, - "reward": 0.5518045127391815, - "reward_std": 0.46641653776168823, - "rewards/correct_code_reward_func": 0.2083333358168602, - "rewards/len_reward_func": 0.3434711843729019, + "grad_norm": 1.2669964512881071, + "kl": 0.2099609375, + "learning_rate": 4.80736811078543e-07, + "loss": 0.0002, + "reward": 0.4997504949569702, + "reward_std": 0.3275124281644821, + "rewards/correct_code_reward_func": 0.12500000558793545, + "rewards/len_reward_func": 0.3747505098581314, "step": 76 }, { - "completion_length": 126.47917175292969, + "completion_length": 36.79166793823242, "epoch": 1.224, - "grad_norm": 0.7328993764471904, - "kl": 0.01027679443359375, - "learning_rate": 1.7028067918809535e-07, - "loss": 0.0, - "reward": 0.5090649425983429, - "reward_std": 0.34914855659008026, - "rewards/correct_code_reward_func": 0.20833333395421505, - "rewards/len_reward_func": 0.3007315993309021, + "grad_norm": 1.1317327261317542, + "kl": 0.153564453125, + "learning_rate": 4.80108633491936e-07, + "loss": 0.0002, + "reward": 0.43036508560180664, + "reward_std": 0.27128875255584717, + "rewards/correct_code_reward_func": 0.1250000037252903, + "rewards/len_reward_func": 0.30536508560180664, "step": 77 }, { - "completion_length": 137.2291717529297, + "completion_length": 53.70833396911621, "epoch": 1.24, - "grad_norm": 0.7418796199300344, - "kl": 0.009124755859375, - "learning_rate": 1.6415619517425294e-07, - "loss": 0.0, - "reward": 0.578598827123642, - "reward_std": 0.4721776694059372, - "rewards/correct_code_reward_func": 0.2500000074505806, - "rewards/len_reward_func": 0.32859882712364197, + "grad_norm": 1.5731589655579585, + "kl": 0.19970703125, + "learning_rate": 4.794708010079288e-07, + "loss": 0.0002, + "reward": 0.5270055830478668, + "reward_std": 0.35258112847805023, + "rewards/correct_code_reward_func": 0.1666666716337204, + "rewards/len_reward_func": 0.36033889651298523, "step": 78 }, { - "completion_length": 158.95833587646484, + "completion_length": 44.37500190734863, "epoch": 1.256, - "grad_norm": 2.068496131846233, - "kl": 0.02191162109375, - "learning_rate": 1.5808957584181994e-07, - "loss": 0.0, - "reward": 0.4869799315929413, - "reward_std": 0.4040430933237076, - "rewards/correct_code_reward_func": 0.1666666679084301, - "rewards/len_reward_func": 0.3203132748603821, + "grad_norm": 1.0496022486728145, + "kl": 0.220703125, + "learning_rate": 4.788233403886949e-07, + "loss": 0.0002, + "reward": 0.48745329678058624, + "reward_std": 0.1396162062883377, + "rewards/correct_code_reward_func": 0.06250000186264515, + "rewards/len_reward_func": 0.42495329678058624, "step": 79 }, { - "completion_length": 106.93750381469727, + "completion_length": 27.291666984558105, "epoch": 1.272, - "grad_norm": 0.5762356445650323, - "kl": 0.00872802734375, - "learning_rate": 1.5208491051320744e-07, - "loss": 0.0, - "reward": 0.5087297856807709, - "reward_std": 0.39580225944519043, - "rewards/correct_code_reward_func": 0.1875000074505806, - "rewards/len_reward_func": 0.3212297558784485, + "grad_norm": 1.0530921922250815, + "kl": 0.216796875, + "learning_rate": 4.78166278800385e-07, + "loss": 0.0002, + "reward": 0.579190120100975, + "reward_std": 0.34055350720882416, + "rewards/correct_code_reward_func": 0.2083333395421505, + "rewards/len_reward_func": 0.37085679173469543, "step": 80 }, { - "completion_length": 114.00000381469727, + "completion_length": 32.50000190734863, "epoch": 1.288, - "grad_norm": 0.786665682978904, - "kl": 0.013519287109375, - "learning_rate": 1.461462467495284e-07, - "loss": 0.0, - "reward": 0.4894479066133499, - "reward_std": 0.3821127265691757, - "rewards/correct_code_reward_func": 0.125, - "rewards/len_reward_func": 0.3644479066133499, + "grad_norm": 1.18045709904702, + "kl": 0.19189453125, + "learning_rate": 4.774996438119876e-07, + "loss": 0.0002, + "reward": 0.6207249760627747, + "reward_std": 0.377775639295578, + "rewards/correct_code_reward_func": 0.1875, + "rewards/len_reward_func": 0.43322494626045227, "step": 81 }, { - "completion_length": 154.83333587646484, + "completion_length": 23.979166984558105, "epoch": 1.304, - "grad_norm": 0.7107423115065932, - "kl": 0.0141754150390625, - "learning_rate": 1.4027758762226107e-07, - "loss": 0.0, - "reward": 0.4811897426843643, - "reward_std": 0.3424055427312851, - "rewards/correct_code_reward_func": 0.1041666679084301, - "rewards/len_reward_func": 0.3770230710506439, + "grad_norm": 1.6753165605296896, + "kl": 0.21875, + "learning_rate": 4.7682346339417157e-07, + "loss": 0.0002, + "reward": 0.5133648067712784, + "reward_std": 0.12423056736588478, + "rewards/correct_code_reward_func": 0.0416666679084301, + "rewards/len_reward_func": 0.471698135137558, "step": 82 }, { - "completion_length": 235.70834350585938, + "completion_length": 41.10416793823242, "epoch": 1.32, - "grad_norm": 0.43515005998860457, - "kl": 0.0047760009765625, - "learning_rate": 1.3448288901490092e-07, - "loss": 0.0, - "reward": 0.4617680013179779, - "reward_std": 0.3315645009279251, - "rewards/correct_code_reward_func": 0.1250000037252903, - "rewards/len_reward_func": 0.3367680013179779, + "grad_norm": 1.5309365229407714, + "kl": 0.23876953125, + "learning_rate": 4.7613776591811295e-07, + "loss": 0.0002, + "reward": 0.5157457143068314, + "reward_std": 0.17486733943223953, + "rewards/correct_code_reward_func": 0.1041666716337204, + "rewards/len_reward_func": 0.41157902777194977, "step": 83 }, { - "completion_length": 143.1666717529297, + "completion_length": 24.70833396911621, "epoch": 1.336, - "grad_norm": 0.65180392684865, - "kl": 0.0086669921875, - "learning_rate": 1.2876605695642084e-07, - "loss": 0.0, - "reward": 0.4678248018026352, - "reward_std": 0.3193260580301285, - "rewards/correct_code_reward_func": 0.1041666679084301, - "rewards/len_reward_func": 0.3636581301689148, + "grad_norm": 1.2440691306439626, + "kl": 0.388671875, + "learning_rate": 4.754425801543046e-07, + "loss": 0.0004, + "reward": 0.46331432461738586, + "reward_std": 0.17115781363099813, + "rewards/correct_code_reward_func": 0.06250000186264515, + "rewards/len_reward_func": 0.4008142799139023, "step": 84 }, { - "completion_length": 131.8333396911621, + "completion_length": 40.08333396911621, "epoch": 1.3519999999999999, - "grad_norm": 0.4416459826072822, - "kl": 0.01190185546875, - "learning_rate": 1.231309449883361e-07, - "loss": 0.0, - "reward": 0.41201435029506683, - "reward_std": 0.3407471626996994, - "rewards/correct_code_reward_func": 0.0416666679084301, - "rewards/len_reward_func": 0.37034766376018524, + "grad_norm": 0.9581765714522964, + "kl": 0.18310546875, + "learning_rate": 4.747379352713488e-07, + "loss": 0.0002, + "reward": 0.5128517299890518, + "reward_std": 0.13994156941771507, + "rewards/correct_code_reward_func": 0.1041666716337204, + "rewards/len_reward_func": 0.4086850583553314, "step": 85 }, { - "completion_length": 103.77083969116211, + "completion_length": 28.6875, "epoch": 1.3679999999999999, - "grad_norm": 0.43361682764693815, - "kl": 0.0066986083984375, - "learning_rate": 1.1758135156715041e-07, - "loss": 0.0, - "reward": 0.423152431845665, - "reward_std": 0.3362526297569275, - "rewards/correct_code_reward_func": 0.1041666679084301, - "rewards/len_reward_func": 0.3189857602119446, + "grad_norm": 1.5448289083630962, + "kl": 0.203125, + "learning_rate": 4.7402386083473364e-07, + "loss": 0.0002, + "reward": 0.6120259165763855, + "reward_std": 0.24722883105278015, + "rewards/correct_code_reward_func": 0.1875, + "rewards/len_reward_func": 0.4245258867740631, "step": 86 }, { - "completion_length": 176.375, + "completion_length": 44.75, "epoch": 1.384, - "grad_norm": 1.352523990215239, - "kl": 0.03180694580078125, - "learning_rate": 1.1212101750393235e-07, - "loss": 0.0, - "reward": 0.4561140537261963, - "reward_std": 0.39378371834754944, + "grad_norm": 1.3501042765686608, + "kl": 0.1884765625, + "learning_rate": 4.7330038680559224e-07, + "loss": 0.0002, + "reward": 0.4797859787940979, + "reward_std": 0.336033895611763, "rewards/correct_code_reward_func": 0.10416666977107525, - "rewards/len_reward_func": 0.3519473969936371, + "rewards/len_reward_func": 0.3756193071603775, "step": 87 }, { - "completion_length": 79.97916793823242, + "completion_length": 21.354166984558105, "epoch": 1.4, - "grad_norm": 2.141260004886994, - "kl": 0.0297698974609375, - "learning_rate": 1.0675362344274952e-07, - "loss": 0.0, - "reward": 0.42587006092071533, - "reward_std": 0.4436161369085312, - "rewards/correct_code_reward_func": 0.125, - "rewards/len_reward_func": 0.30087001621723175, + "grad_norm": 1.4869496350604456, + "kl": 0.30322265625, + "learning_rate": 4.72567543539446e-07, + "loss": 0.0003, + "reward": 0.4333198815584183, + "reward_std": 0.28416211903095245, + "rewards/correct_code_reward_func": 0.0416666679084301, + "rewards/len_reward_func": 0.3916532099246979, "step": 88 }, { - "completion_length": 107.37500381469727, + "completion_length": 27.58333396911621, "epoch": 1.416, - "grad_norm": 0.4125343610721507, - "kl": 0.017974853515625, - "learning_rate": 1.0148278737965844e-07, - "loss": 0.0, - "reward": 0.5323555767536163, - "reward_std": 0.4073493778705597, - "rewards/correct_code_reward_func": 0.1666666716337204, - "rewards/len_reward_func": 0.36568886041641235, + "grad_norm": 1.6228999757156357, + "kl": 0.3466796875, + "learning_rate": 4.718253617849305e-07, + "loss": 0.0003, + "reward": 0.5190460979938507, + "reward_std": 0.24678421020507812, + "rewards/correct_code_reward_func": 0.1041666716337204, + "rewards/len_reward_func": 0.4148794263601303, "step": 89 }, { - "completion_length": 87.83333587646484, + "completion_length": 30.895834922790527, "epoch": 1.432, - "grad_norm": 1.3465092335562199, - "kl": 0.0328369140625, - "learning_rate": 9.631206222392479e-08, - "loss": 0.0, - "reward": 0.5186317265033722, - "reward_std": 0.4107673317193985, - "rewards/correct_code_reward_func": 0.1666666716337204, - "rewards/len_reward_func": 0.3519650846719742, + "grad_norm": 1.3213740664946243, + "kl": 0.2158203125, + "learning_rate": 4.7107387268250586e-07, + "loss": 0.0002, + "reward": 0.5004410147666931, + "reward_std": 0.31192412972450256, + "rewards/correct_code_reward_func": 0.1458333395421505, + "rewards/len_reward_func": 0.3546076714992523, "step": 90 }, { - "completion_length": 124.33333969116211, + "completion_length": 37.60416793823242, "epoch": 1.448, - "grad_norm": 0.7479671719828264, - "kl": 0.02008056640625, - "learning_rate": 9.124493340311537e-08, - "loss": 0.0, - "reward": 0.5086182951927185, - "reward_std": 0.12347583472728729, - "rewards/correct_code_reward_func": 0.0833333358168602, - "rewards/len_reward_func": 0.4252849221229553, + "grad_norm": 1.579120317520106, + "kl": 0.25048828125, + "learning_rate": 4.703131077631497e-07, + "loss": 0.0002, + "reward": 0.6014063358306885, + "reward_std": 0.07078037410974503, + "rewards/correct_code_reward_func": 0.1458333432674408, + "rewards/len_reward_func": 0.4555730074644089, "step": 91 }, { - "completion_length": 167.79166793823242, + "completion_length": 24.229166984558105, "epoch": 1.464, - "grad_norm": 1.1543274217148969, - "kl": 0.0083770751953125, - "learning_rate": 8.628481651367875e-08, - "loss": 0.0, - "reward": 0.6109435856342316, - "reward_std": 0.422005370259285, - "rewards/correct_code_reward_func": 0.2708333432674408, - "rewards/len_reward_func": 0.34011024236679077, + "grad_norm": 0.7584124455777198, + "kl": 0.248046875, + "learning_rate": 4.6954309894703426e-07, + "loss": 0.0002, + "reward": 0.4590907543897629, + "reward_std": 0.16932503879070282, + "rewards/correct_code_reward_func": 0.0416666679084301, + "rewards/len_reward_func": 0.4174240827560425, "step": 92 }, { - "completion_length": 115.33333969116211, + "completion_length": 30.64583396911621, "epoch": 1.48, - "grad_norm": 0.3931440618018204, - "kl": 0.0080108642578125, - "learning_rate": 8.143505501859551e-08, - "loss": 0.0, - "reward": 0.44855794310569763, - "reward_std": 0.32760028541088104, - "rewards/correct_code_reward_func": 0.1250000037252903, - "rewards/len_reward_func": 0.3235579580068588, + "grad_norm": 0.0060252039125188945, + "kl": 0.17919921875, + "learning_rate": 4.6876387854218744e-07, + "loss": 0.0001, + "reward": 0.46853742003440857, + "reward_std": 0.08898964250827746, + "rewards/correct_code_reward_func": 0.06250000186264515, + "rewards/len_reward_func": 0.40603742003440857, "step": 93 }, { - "completion_length": 151.39583587646484, + "completion_length": 36.25000190734863, "epoch": 1.496, - "grad_norm": 0.5894110998499853, - "kl": 0.009307861328125, - "learning_rate": 7.669891799365282e-08, - "loss": 0.0, - "reward": 0.36864979565143585, - "reward_std": 0.36151623725891113, - "rewards/correct_code_reward_func": 0.0416666679084301, - "rewards/len_reward_func": 0.32698309421539307, + "grad_norm": 0.8958155592124936, + "kl": 0.19775390625, + "learning_rate": 4.6797547924313673e-07, + "loss": 0.0002, + "reward": 0.44910070300102234, + "reward_std": 0.24745193123817444, + "rewards/correct_code_reward_func": 0.02083333395421505, + "rewards/len_reward_func": 0.42826738953590393, "step": 94 }, { - "completion_length": 147.7916717529297, + "completion_length": 40.35416793823242, "epoch": 1.512, - "grad_norm": 0.5972288037218009, - "kl": 0.019500732421875, - "learning_rate": 7.207959792385998e-08, - "loss": 0.0, - "reward": 0.4600509703159332, - "reward_std": 0.4011112302541733, - "rewards/correct_code_reward_func": 0.1041666679084301, - "rewards/len_reward_func": 0.3558843284845352, + "grad_norm": 1.5558035278415578, + "kl": 0.21337890625, + "learning_rate": 4.6717793412953776e-07, + "loss": 0.0002, + "reward": 0.44604572653770447, + "reward_std": 0.30530666559934616, + "rewards/correct_code_reward_func": 0.0416666679084301, + "rewards/len_reward_func": 0.40437906980514526, "step": 95 }, { - "completion_length": 149.9791717529297, + "completion_length": 35.291666984558105, "epoch": 1.528, - "grad_norm": 0.86174582631064, - "kl": 0.00836181640625, - "learning_rate": 6.758020855149249e-08, - "loss": 0.0, - "reward": 0.44805125892162323, - "reward_std": 0.33948560059070587, - "rewards/correct_code_reward_func": 0.125, - "rewards/len_reward_func": 0.32305125892162323, + "grad_norm": 1.4321261431714865, + "kl": 0.39892578125, + "learning_rate": 4.6637127666478617e-07, + "loss": 0.0004, + "reward": 0.46982091665267944, + "reward_std": 0.08393960446119308, + "rewards/correct_code_reward_func": 0.02083333395421505, + "rewards/len_reward_func": 0.44898758828639984, "step": 96 }, { - "completion_length": 163.6041717529297, + "completion_length": 24.354166984558105, "epoch": 1.544, - "grad_norm": 0.36337651628903067, - "kl": 0.0181121826171875, - "learning_rate": 6.320378277721342e-08, - "loss": 0.0, - "reward": 0.4679824113845825, - "reward_std": 0.4093552529811859, - "rewards/correct_code_reward_func": 0.10416666977107525, - "rewards/len_reward_func": 0.3638157695531845, + "grad_norm": 0.929795596193928, + "kl": 0.29443359375, + "learning_rate": 4.6555554069461346e-07, + "loss": 0.0003, + "reward": 0.46133650839328766, + "reward_std": 0.2364010475575924, + "rewards/correct_code_reward_func": 0.0625, + "rewards/len_reward_func": 0.39883650839328766, "step": 97 }, { - "completion_length": 102.75000381469727, + "completion_length": 28.95833396911621, "epoch": 1.56, - "grad_norm": 1.175706006208444, - "kl": 0.023834228515625, - "learning_rate": 5.895327061568775e-08, - "loss": 0.0, - "reward": 0.5549444258213043, - "reward_std": 0.43229806423187256, - "rewards/correct_code_reward_func": 0.2500000111758709, - "rewards/len_reward_func": 0.3049444109201431, + "grad_norm": 0.6658422780899015, + "kl": 0.31982421875, + "learning_rate": 4.647307604456674e-07, + "loss": 0.0003, + "reward": 0.5673407018184662, + "reward_std": 0.12253111600875854, + "rewards/correct_code_reward_func": 0.1041666716337204, + "rewards/len_reward_func": 0.4631740301847458, "step": 98 }, { - "completion_length": 144.58333587646484, + "completion_length": 15.770833969116211, "epoch": 1.576, - "grad_norm": 0.7970883391032116, - "kl": 0.011474609375, - "learning_rate": 5.483153720706798e-08, - "loss": 0.0, - "reward": 0.5082628130912781, - "reward_std": 0.46417203545570374, - "rewards/correct_code_reward_func": 0.2083333432674408, - "rewards/len_reward_func": 0.2999294698238373, + "grad_norm": 2.493104620020484, + "kl": 0.4345703125, + "learning_rate": 4.6389697052407526e-07, + "loss": 0.0004, + "reward": 0.591472789645195, + "reward_std": 0.3403037488460541, + "rewards/correct_code_reward_func": 0.1666666716337204, + "rewards/len_reward_func": 0.4248061031103134, "step": 99 }, { - "completion_length": 93.41667175292969, + "completion_length": 18.000000953674316, "epoch": 1.592, - "grad_norm": 0.577298607650083, - "kl": 0.0172119140625, - "learning_rate": 5.0841360885690996e-08, - "loss": 0.0, - "reward": 0.43817608058452606, - "reward_std": 0.4479677081108093, - "rewards/correct_code_reward_func": 0.1458333358168602, - "rewards/len_reward_func": 0.29234276711940765, + "grad_norm": 1.652920479976726, + "kl": 0.48828125, + "learning_rate": 4.630542059139923e-07, + "loss": 0.0005, + "reward": 0.4733283668756485, + "reward_std": 0.07313242554664612, + "rewards/correct_code_reward_func": 0.0, + "rewards/len_reward_func": 0.4733283668756485, "step": 100 }, { - "completion_length": 112.79167175292969, + "completion_length": 24.104166984558105, "epoch": 1.608, - "grad_norm": 0.6328737919939129, - "kl": 0.0224151611328125, - "learning_rate": 4.698543130728755e-08, - "loss": 0.0, - "reward": 0.6780606508255005, - "reward_std": 0.37215377390384674, - "rewards/correct_code_reward_func": 0.3333333358168602, - "rewards/len_reward_func": 0.3447272926568985, + "grad_norm": 0.6637104853285745, + "kl": 0.3203125, + "learning_rate": 4.622025019761336e-07, + "loss": 0.0003, + "reward": 0.6387104988098145, + "reward_std": 0.21521971747279167, + "rewards/correct_code_reward_func": 0.2083333432674408, + "rewards/len_reward_func": 0.43037715554237366, "step": 101 }, { - "completion_length": 125.33333587646484, + "completion_length": 23.687500953674316, "epoch": 1.624, - "grad_norm": 0.8805964414937946, - "kl": 0.007904052734375, - "learning_rate": 4.326634763596784e-08, - "loss": 0.0, - "reward": 0.5231586992740631, - "reward_std": 0.39199909567832947, - "rewards/correct_code_reward_func": 0.125, - "rewards/len_reward_func": 0.3981587141752243, + "grad_norm": 1.096819799191345, + "kl": 0.3330078125, + "learning_rate": 4.613418944462906e-07, + "loss": 0.0003, + "reward": 0.5115740746259689, + "reward_std": 0.20615240186452866, + "rewards/correct_code_reward_func": 0.0833333358168602, + "rewards/len_reward_func": 0.42824074625968933, "step": 102 }, { - "completion_length": 143.81250762939453, + "completion_length": 18.291667938232422, "epoch": 1.6400000000000001, - "grad_norm": 0.4566354927971138, - "kl": 0.0088348388671875, - "learning_rate": 3.968661679220467e-08, - "loss": 0.0, - "reward": 0.5204833149909973, - "reward_std": 0.25895993411540985, - "rewards/correct_code_reward_func": 0.1041666679084301, - "rewards/len_reward_func": 0.4163166582584381, + "grad_norm": 0.7425841429065823, + "kl": 0.4208984375, + "learning_rate": 4.6047241943383173e-07, + "loss": 0.0004, + "reward": 0.5208333432674408, + "reward_std": 0.0589255653321743, + "rewards/correct_code_reward_func": 0.06250000186264515, + "rewards/len_reward_func": 0.4583333432674408, "step": 103 }, { - "completion_length": 118.27083587646484, + "completion_length": 20.1875, "epoch": 1.6560000000000001, - "grad_norm": 0.41571642382372326, - "kl": 0.00982666015625, - "learning_rate": 3.624865176299499e-08, - "loss": 0.0, - "reward": 0.6309479027986526, - "reward_std": 0.4369208961725235, - "rewards/correct_code_reward_func": 0.29166667722165585, - "rewards/len_reward_func": 0.33928124606609344, + "grad_norm": 1.0672134971206753, + "kl": 0.2548828125, + "learning_rate": 4.5959411342018704e-07, + "loss": 0.0003, + "reward": 0.5686589479446411, + "reward_std": 0.1569155752658844, + "rewards/correct_code_reward_func": 0.1458333432674408, + "rewards/len_reward_func": 0.42282557487487793, "step": 104 }, { - "completion_length": 127.50000381469727, + "completion_length": 18.89583396911621, "epoch": 1.6720000000000002, - "grad_norm": 0.44810915489413017, - "kl": 0.021331787109375, - "learning_rate": 3.295476997533905e-08, - "loss": 0.0, - "reward": 0.43531325459480286, - "reward_std": 0.2511162757873535, + "grad_norm": 1.9534809364444377, + "kl": 0.4580078125, + "learning_rate": 4.5870701325731773e-07, + "loss": 0.0005, + "reward": 0.5205335766077042, + "reward_std": 0.14886049553751945, "rewards/correct_code_reward_func": 0.0416666679084301, - "rewards/len_reward_func": 0.39364662766456604, + "rewards/len_reward_func": 0.47886690497398376, "step": 105 }, { - "completion_length": 179.75000762939453, + "completion_length": 30.854167938232422, "epoch": 1.688, - "grad_norm": 0.5235689630320457, - "kl": 0.029052734375, - "learning_rate": 2.980719173413396e-08, - "loss": 0.0, - "reward": 0.354349747300148, - "reward_std": 0.2833855152130127, - "rewards/correct_code_reward_func": 0.0416666679084301, - "rewards/len_reward_func": 0.3126830607652664, + "grad_norm": 1.886180541460537, + "kl": 0.2861328125, + "learning_rate": 4.578111561661702e-07, + "loss": 0.0003, + "reward": 0.5625, + "reward_std": 0.1753452718257904, + "rewards/correct_code_reward_func": 0.0833333358168602, + "rewards/len_reward_func": 0.4791666716337204, "step": 106 }, { - "completion_length": 120.50000381469727, + "completion_length": 22.979166984558105, "epoch": 1.704, - "grad_norm": 1.2523814931919, - "kl": 0.0085601806640625, - "learning_rate": 2.680803872553408e-08, - "loss": 0.0, - "reward": 0.48751458525657654, - "reward_std": 0.4157916307449341, - "rewards/correct_code_reward_func": 0.1666666716337204, - "rewards/len_reward_func": 0.32084792852401733, + "grad_norm": 1.3514702193989006, + "kl": 0.3154296875, + "learning_rate": 4.569065797351135e-07, + "loss": 0.0003, + "reward": 0.614065021276474, + "reward_std": 0.30389876663684845, + "rewards/correct_code_reward_func": 0.2083333432674408, + "rewards/len_reward_func": 0.4057316929101944, "step": 107 }, { - "completion_length": 151.16666793823242, + "completion_length": 22.14583396911621, "epoch": 1.72, - "grad_norm": 0.6447932351262547, - "kl": 0.00946807861328125, - "learning_rate": 2.395933258678745e-08, - "loss": 0.0, - "reward": 0.6216467022895813, - "reward_std": 0.44435153901576996, - "rewards/correct_code_reward_func": 0.2708333432674408, - "rewards/len_reward_func": 0.3508133441209793, + "grad_norm": 1.3204173636971241, + "kl": 0.359375, + "learning_rate": 4.559933219183631e-07, + "loss": 0.0004, + "reward": 0.5397135615348816, + "reward_std": 0.11889010295271873, + "rewards/correct_code_reward_func": 0.0416666679084301, + "rewards/len_reward_func": 0.498046875, "step": 108 }, { - "completion_length": 123.4375, + "completion_length": 16.000000476837158, "epoch": 1.736, - "grad_norm": 0.5052351271825365, - "kl": 0.0091552734375, - "learning_rate": 2.1262993543511715e-08, - "loss": 0.0, - "reward": 0.6645375192165375, - "reward_std": 0.4121406674385071, - "rewards/correct_code_reward_func": 0.2916666716337204, - "rewards/len_reward_func": 0.37287086248397827, + "grad_norm": 1.358034882182442, + "kl": 0.4677734375, + "learning_rate": 4.550714210343879e-07, + "loss": 0.0005, + "reward": 0.7275510132312775, + "reward_std": 0.3622310161590576, + "rewards/correct_code_reward_func": 0.3333333432674408, + "rewards/len_reward_func": 0.39421766996383667, "step": 109 }, { - "completion_length": 153.47916793823242, + "completion_length": 23.14583396911621, "epoch": 1.752, - "grad_norm": 0.6369640119331228, - "kl": 0.0086212158203125, - "learning_rate": 1.872083911532907e-08, - "loss": 0.0, - "reward": 0.40650297701358795, - "reward_std": 0.27610746026039124, - "rewards/correct_code_reward_func": 0.0416666679084301, - "rewards/len_reward_func": 0.36483629047870636, + "grad_norm": 1.916868694407283, + "kl": 0.3154296875, + "learning_rate": 4.541409157643027e-07, + "loss": 0.0003, + "reward": 0.4378484785556793, + "reward_std": 0.12258240953087807, + "rewards/correct_code_reward_func": 0.0, + "rewards/len_reward_func": 0.4378484785556793, "step": 110 }, { - "completion_length": 132.1041717529297, + "completion_length": 13.75, "epoch": 1.768, - "grad_norm": 0.44311314663042145, - "kl": 0.01031494140625, - "learning_rate": 1.6334582890731697e-08, - "loss": 0.0, - "reward": 0.7341299653053284, - "reward_std": 0.504623532295227, - "rewards/correct_code_reward_func": 0.4375, - "rewards/len_reward_func": 0.29662999510765076, + "grad_norm": 1.6380973978730606, + "kl": 0.513671875, + "learning_rate": 4.5320184515024493e-07, + "loss": 0.0005, + "reward": 0.6193452775478363, + "reward_std": 0.22009535133838654, + "rewards/correct_code_reward_func": 0.125, + "rewards/len_reward_func": 0.4943452626466751, "step": 111 }, { - "completion_length": 162.0625, + "completion_length": 39.854166984558105, "epoch": 1.784, - "grad_norm": 0.6586562319299408, - "kl": 0.0101318359375, - "learning_rate": 1.4105833372004523e-08, - "loss": 0.0, - "reward": 0.4220695346593857, - "reward_std": 0.2905489057302475, - "rewards/correct_code_reward_func": 0.0416666679084301, - "rewards/len_reward_func": 0.3804028630256653, + "grad_norm": 3.8105256272979995, + "kl": 0.2919921875, + "learning_rate": 4.5225424859373684e-07, + "loss": 0.0003, + "reward": 0.523853987455368, + "reward_std": 0.16823304444551468, + "rewards/correct_code_reward_func": 0.06250000186264515, + "rewards/len_reward_func": 0.46135397255420685, "step": 112 }, { - "completion_length": 106.89583587646484, + "completion_length": 15.291666984558105, "epoch": 1.8, - "grad_norm": 0.7764527703399275, - "kl": 0.019287109375, - "learning_rate": 1.2036092890982619e-08, - "loss": 0.0, - "reward": 0.5292025506496429, - "reward_std": 0.3711909055709839, - "rewards/correct_code_reward_func": 0.1458333358168602, - "rewards/len_reward_func": 0.38336920738220215, + "grad_norm": 1.1650221360198958, + "kl": 0.541015625, + "learning_rate": 4.51298165854032e-07, + "loss": 0.0005, + "reward": 0.5442352294921875, + "reward_std": 0.06456775963306427, + "rewards/correct_code_reward_func": 0.0833333358168602, + "rewards/len_reward_func": 0.4609019011259079, "step": 113 }, { - "completion_length": 132.2916717529297, + "completion_length": 20.625, "epoch": 1.8159999999999998, - "grad_norm": 0.6064692969361543, - "kl": 0.009307861328125, - "learning_rate": 1.0126756596375685e-08, - "loss": 0.0, - "reward": 0.48789724707603455, - "reward_std": 0.3553258925676346, - "rewards/correct_code_reward_func": 0.1458333358168602, - "rewards/len_reward_func": 0.34206391870975494, + "grad_norm": 0.020013783247992148, + "kl": 0.3828125, + "learning_rate": 4.503336370464475e-07, + "loss": 0.0004, + "reward": 0.5208333432674408, + "reward_std": 0.0589255653321743, + "rewards/correct_code_reward_func": 0.0416666679084301, + "rewards/len_reward_func": 0.4791666716337204, "step": 114 }, { - "completion_length": 120.22917175292969, + "completion_length": 18.437500953674316, "epoch": 1.8319999999999999, - "grad_norm": 0.3843336433099292, - "kl": 0.0174560546875, - "learning_rate": 8.379111513340753e-09, - "loss": 0.0, - "reward": 0.47604209184646606, - "reward_std": 0.3304741531610489, - "rewards/correct_code_reward_func": 0.0833333358168602, - "rewards/len_reward_func": 0.39270876348018646, + "grad_norm": 0.8327328617247051, + "kl": 0.396484375, + "learning_rate": 4.4936070264068016e-07, + "loss": 0.0004, + "reward": 0.6041666716337204, + "reward_std": 0.2620653882622719, + "rewards/correct_code_reward_func": 0.1875, + "rewards/len_reward_func": 0.4166666716337204, "step": 115 }, { - "completion_length": 145.31250762939453, + "completion_length": 23.64583396911621, "epoch": 1.8479999999999999, - "grad_norm": 0.5532480666977985, - "kl": 0.024169921875, - "learning_rate": 6.7943356759381785e-09, - "loss": 0.0, - "reward": 0.46019650995731354, - "reward_std": 0.3715391010046005, - "rewards/correct_code_reward_func": 0.1041666679084301, - "rewards/len_reward_func": 0.35602983832359314, + "grad_norm": 1.683115986414425, + "kl": 0.34375, + "learning_rate": 4.4837940345910917e-07, + "loss": 0.0003, + "reward": 0.539160430431366, + "reward_std": 0.08424048312008381, + "rewards/correct_code_reward_func": 0.0833333358168602, + "rewards/len_reward_func": 0.45582708716392517, "step": 116 }, { - "completion_length": 149.93750381469727, + "completion_length": 17.479166984558105, "epoch": 1.8639999999999999, - "grad_norm": 1.0372718715130624, - "kl": 0.00921630859375, - "learning_rate": 5.373497333054616e-09, - "loss": 0.0, - "reward": 0.4656513184309006, - "reward_std": 0.44700081646442413, - "rewards/correct_code_reward_func": 0.1666666716337204, - "rewards/len_reward_func": 0.29898466169834137, + "grad_norm": 0.9874831743158241, + "kl": 0.3681640625, + "learning_rate": 4.473897806750828e-07, + "loss": 0.0003, + "reward": 0.47979801893234253, + "reward_std": 0.1770581193268299, + "rewards/correct_code_reward_func": 0.0833333358168602, + "rewards/len_reward_func": 0.39646467566490173, "step": 117 }, { - "completion_length": 144.89583587646484, + "completion_length": 17.791666984558105, "epoch": 1.88, - "grad_norm": 0.4882633725665408, - "kl": 0.014007568359375, - "learning_rate": 4.117554228329406e-09, - "loss": 0.0, - "reward": 0.5118084400892258, - "reward_std": 0.34739528596401215, - "rewards/correct_code_reward_func": 0.1875000074505806, - "rewards/len_reward_func": 0.32430844008922577, + "grad_norm": 1.4038513263647607, + "kl": 0.3759765625, + "learning_rate": 4.4639187581119116e-07, + "loss": 0.0004, + "reward": 0.6913580298423767, + "reward_std": 0.26719751954078674, + "rewards/correct_code_reward_func": 0.2500000074505806, + "rewards/len_reward_func": 0.4413580149412155, "step": 118 }, { - "completion_length": 97.45833587646484, + "completion_length": 19.08333396911621, "epoch": 1.896, - "grad_norm": 0.4820971241858704, - "kl": 0.01251220703125, - "learning_rate": 3.0273529545687125e-09, - "loss": 0.0, - "reward": 0.4883972406387329, - "reward_std": 0.43138815462589264, - "rewards/correct_code_reward_func": 0.12500000558793545, - "rewards/len_reward_func": 0.3633972257375717, + "grad_norm": 2.278605940745065, + "kl": 0.3603515625, + "learning_rate": 4.453857307375236e-07, + "loss": 0.0004, + "reward": 0.5320361256599426, + "reward_std": 0.10937795042991638, + "rewards/correct_code_reward_func": 0.0416666679084301, + "rewards/len_reward_func": 0.4903694689273834, "step": 119 }, { - "completion_length": 120.95833587646484, + "completion_length": 20.166666984558105, "epoch": 1.912, - "grad_norm": 0.6000792918829387, - "kl": 0.02130126953125, - "learning_rate": 2.1036283830834224e-09, - "loss": 0.0, - "reward": 0.6599289178848267, - "reward_std": 0.32202909141778946, - "rewards/correct_code_reward_func": 0.2291666716337204, - "rewards/len_reward_func": 0.4307622164487839, + "grad_norm": 0.010837700850525291, + "kl": 0.326171875, + "learning_rate": 4.443713876699123e-07, + "loss": 0.0003, + "reward": 0.6041666865348816, + "reward_std": 0.08625819534063339, + "rewards/correct_code_reward_func": 0.1041666716337204, + "rewards/len_reward_func": 0.5, "step": 120 }, { - "completion_length": 161.4791717529297, + "completion_length": 18.916666984558105, "epoch": 1.928, - "grad_norm": 0.9160239570442854, - "kl": 0.040740966796875, - "learning_rate": 1.347003168334665e-09, - "loss": 0.0, - "reward": 0.6658598780632019, - "reward_std": 0.45104770362377167, - "rewards/correct_code_reward_func": 0.3750000149011612, - "rewards/len_reward_func": 0.2908598631620407, + "grad_norm": 4.137929355426566, + "kl": 0.3173828125, + "learning_rate": 4.433488891681609e-07, + "loss": 0.0003, + "reward": 0.7551863789558411, + "reward_std": 0.3335232138633728, + "rewards/correct_code_reward_func": 0.3125000149011612, + "rewards/len_reward_func": 0.44268636405467987, "step": 121 }, { - "completion_length": 197.33333587646484, + "completion_length": 30.500001907348633, "epoch": 1.944, - "grad_norm": 1.2887565831293122, - "kl": 0.0357666015625, - "learning_rate": 7.579873282216598e-10, - "loss": 0.0, - "reward": 0.5509243905544281, - "reward_std": 0.42525260150432587, - "rewards/correct_code_reward_func": 0.229166679084301, - "rewards/len_reward_func": 0.3217576891183853, + "grad_norm": 1.9378965639286245, + "kl": 0.2412109375, + "learning_rate": 4.423182781342588e-07, + "loss": 0.0002, + "reward": 0.5831037163734436, + "reward_std": 0.3190010190010071, + "rewards/correct_code_reward_func": 0.1458333358168602, + "rewards/len_reward_func": 0.4372703731060028, "step": 122 }, { - "completion_length": 238.39584350585938, + "completion_length": 31.833334922790527, "epoch": 1.96, - "grad_norm": 0.36929428922486, - "kl": 0.0052337646484375, - "learning_rate": 3.3697790029424413e-10, - "loss": 0.0, - "reward": 0.5057644844055176, - "reward_std": 0.4373111426830292, - "rewards/correct_code_reward_func": 0.1458333395421505, - "rewards/len_reward_func": 0.3599311411380768, + "grad_norm": 1.4092055403920425, + "kl": 0.294921875, + "learning_rate": 4.412795978105807e-07, + "loss": 0.0003, + "reward": 0.6458333432674408, + "reward_std": 0.28126100823283195, + "rewards/correct_code_reward_func": 0.14583333395421505, + "rewards/len_reward_func": 0.5, "step": 123 }, { - "completion_length": 126.41667175292969, + "completion_length": 20.75, "epoch": 1.976, - "grad_norm": 0.4901359623088852, - "kl": 0.007110595703125, - "learning_rate": 8.425867412190091e-11, - "loss": 0.0, - "reward": 0.5390879511833191, - "reward_std": 0.30444033443927765, - "rewards/correct_code_reward_func": 0.1666666716337204, - "rewards/len_reward_func": 0.3724212795495987, + "grad_norm": 1.8731345483243946, + "kl": 0.3056640625, + "learning_rate": 4.402328917780728e-07, + "loss": 0.0003, + "reward": 0.6243590116500854, + "reward_std": 0.19208041578531265, + "rewards/correct_code_reward_func": 0.1458333395421505, + "rewards/len_reward_func": 0.47852563858032227, "step": 124 }, { - "completion_length": 196.77083587646484, + "completion_length": 34.43750190734863, "epoch": 1.992, - "grad_norm": 0.41890098929296765, - "kl": 0.005767822265625, - "learning_rate": 0.0, - "loss": 0.0, - "reward": 0.5094343423843384, - "reward_std": 0.314863845705986, - "rewards/correct_code_reward_func": 0.1041666679084301, - "rewards/len_reward_func": 0.4052676856517792, + "grad_norm": 2.251696553247793, + "kl": 0.17919921875, + "learning_rate": 4.391782039544238e-07, + "loss": 0.0002, + "reward": 0.5416666716337204, + "reward_std": 0.1451837606728077, + "rewards/correct_code_reward_func": 0.0625, + "rewards/len_reward_func": 0.4791666716337204, "step": 125 }, { - "completion_length": 112.04166793823242, - "epoch": 2.032, - "grad_norm": 0.738755336283657, - "kl": 0.010650634765625, + "completion_length": 20.45833396911621, + "epoch": 2.0, + "grad_norm": 0.7665869817926465, + "kl": 0.16015625, "learning_rate": 4.381155785922225e-07, - "loss": 0.0, - "reward": 0.3895547538995743, - "reward_std": 0.2125089094042778, - "rewards/correct_code_reward_func": 0.02083333395421505, - "rewards/len_reward_func": 0.3687214255332947, + "loss": 0.0001, + "reward": 0.625, + "reward_std": 0.17251639068126678, + "rewards/correct_code_reward_func": 0.125, + "rewards/len_reward_func": 0.5, "step": 126 }, { - "completion_length": 184.4791717529297, - "epoch": 2.048, - "grad_norm": 0.8070204431640005, - "kl": 0.0174560546875, + "completion_length": 22.64583396911621, + "epoch": 2.016, + "grad_norm": 1.4642947533469586, + "kl": 0.23876953125, "learning_rate": 4.37045060277101e-07, - "loss": 0.0, - "reward": 0.4946564882993698, - "reward_std": 0.3218688368797302, - "rewards/correct_code_reward_func": 0.1041666716337204, - "rewards/len_reward_func": 0.390489786863327, + "loss": 0.0002, + "reward": 0.7051360011100769, + "reward_std": 0.33548664301633835, + "rewards/correct_code_reward_func": 0.3125, + "rewards/len_reward_func": 0.3926360011100769, "step": 127 }, { - "completion_length": 119.72917175292969, - "epoch": 2.064, - "grad_norm": 0.5026981022828905, - "kl": 0.00691986083984375, + "completion_length": 25.791667938232422, + "epoch": 2.032, + "grad_norm": 2.2744596355142987, + "kl": 0.23046875, "learning_rate": 4.3596669392586363e-07, - "loss": 0.0, - "reward": 0.5274929702281952, - "reward_std": 0.37383925914764404, - "rewards/correct_code_reward_func": 0.2083333432674408, - "rewards/len_reward_func": 0.3191596120595932, + "loss": 0.0002, + "reward": 0.45531921088695526, + "reward_std": 0.2537742704153061, + "rewards/correct_code_reward_func": 0.0416666679084301, + "rewards/len_reward_func": 0.41365256905555725, "step": 128 }, { - "completion_length": 93.6875, - "epoch": 2.08, - "grad_norm": 0.4543367066189644, - "kl": 0.006072998046875, + "completion_length": 30.979167938232422, + "epoch": 2.048, + "grad_norm": 1.152083526235397, + "kl": 0.23193359375, "learning_rate": 4.348805247846027e-07, - "loss": 0.0, - "reward": 0.5541968941688538, - "reward_std": 0.313668429851532, - "rewards/correct_code_reward_func": 0.12500000558793545, - "rewards/len_reward_func": 0.4291968494653702, + "loss": 0.0002, + "reward": 0.6339346170425415, + "reward_std": 0.23110489547252655, + "rewards/correct_code_reward_func": 0.20833333395421505, + "rewards/len_reward_func": 0.4256012886762619, "step": 129 }, { - "completion_length": 107.39583587646484, - "epoch": 2.096, - "grad_norm": 0.8856711441311981, - "kl": 0.01507568359375, + "completion_length": 25.08333396911621, + "epoch": 2.064, + "grad_norm": 2.3077859885366028, + "kl": 0.26806640625, "learning_rate": 4.337865984268001e-07, - "loss": 0.0, - "reward": 0.569198876619339, - "reward_std": 0.41964802145957947, - "rewards/correct_code_reward_func": 0.25, - "rewards/len_reward_func": 0.3191988617181778, + "loss": 0.0003, + "reward": 0.6608805358409882, + "reward_std": 0.2853649668395519, + "rewards/correct_code_reward_func": 0.18750000558793545, + "rewards/len_reward_func": 0.47338053584098816, "step": 130 }, { - "completion_length": 108.22917175292969, - "epoch": 2.112, - "grad_norm": 0.7996139425979845, - "kl": 0.022613525390625, + "completion_length": 29.854167938232422, + "epoch": 2.08, + "grad_norm": 1.3425058048706657, + "kl": 0.197265625, "learning_rate": 4.326849607514148e-07, - "loss": 0.0, - "reward": 0.4316745698451996, - "reward_std": 0.4076043367385864, - "rewards/correct_code_reward_func": 0.10416666977107525, - "rewards/len_reward_func": 0.3275078982114792, + "loss": 0.0002, + "reward": 0.5538955330848694, + "reward_std": 0.28982797265052795, + "rewards/correct_code_reward_func": 0.1041666679084301, + "rewards/len_reward_func": 0.4497288465499878, "step": 131 }, { - "completion_length": 115.31250381469727, - "epoch": 2.128, - "grad_norm": 1.0736973825243263, - "kl": 0.027557373046875, + "completion_length": 30.70833396911621, + "epoch": 2.096, + "grad_norm": 2.173464158764514, + "kl": 0.2490234375, "learning_rate": 4.3157565798095746e-07, - "loss": 0.0, - "reward": 0.5582720190286636, - "reward_std": 0.4011659473180771, + "loss": 0.0002, + "reward": 0.6434731781482697, + "reward_std": 0.31796523183584213, "rewards/correct_code_reward_func": 0.2083333432674408, - "rewards/len_reward_func": 0.34993869066238403, + "rewards/len_reward_func": 0.43513981997966766, "step": 132 }, { - "completion_length": 94.22916793823242, - "epoch": 2.144, - "grad_norm": 0.6906722352229689, - "kl": 0.016448974609375, + "completion_length": 29.20833396911621, + "epoch": 2.112, + "grad_norm": 1.5233613223232554, + "kl": 0.27880859375, "learning_rate": 4.304587366595505e-07, - "loss": 0.0, - "reward": 0.44807741045951843, - "reward_std": 0.3236899673938751, - "rewards/correct_code_reward_func": 0.0833333358168602, - "rewards/len_reward_func": 0.36474408209323883, + "loss": 0.0003, + "reward": 0.6140289306640625, + "reward_std": 0.17255835235118866, + "rewards/correct_code_reward_func": 0.1666666716337204, + "rewards/len_reward_func": 0.4473622739315033, "step": 133 }, { - "completion_length": 139.9166717529297, - "epoch": 2.16, - "grad_norm": 0.4747279286645759, - "kl": 0.0152130126953125, + "completion_length": 34.08333396911621, + "epoch": 2.128, + "grad_norm": 2.653484217985997, + "kl": 0.18212890625, "learning_rate": 4.293342436509756e-07, - "loss": 0.0, - "reward": 0.4297265112400055, - "reward_std": 0.1865110769867897, - "rewards/correct_code_reward_func": 0.06250000186264515, - "rewards/len_reward_func": 0.3672265261411667, + "loss": 0.0002, + "reward": 0.6950471103191376, + "reward_std": 0.298052042722702, + "rewards/correct_code_reward_func": 0.2500000074505806, + "rewards/len_reward_func": 0.4450470805168152, "step": 134 }, { - "completion_length": 80.77083587646484, - "epoch": 2.176, - "grad_norm": 0.7513969083123188, - "kl": 0.02288818359375, + "completion_length": 27.6875, + "epoch": 2.144, + "grad_norm": 1.7492354771411964, + "kl": 0.216796875, "learning_rate": 4.282022261367073e-07, - "loss": 0.0, - "reward": 0.6259580850601196, - "reward_std": 0.40002915263175964, - "rewards/correct_code_reward_func": 0.25, - "rewards/len_reward_func": 0.3759580999612808, + "loss": 0.0002, + "reward": 0.48720741271972656, + "reward_std": 0.2177068144083023, + "rewards/correct_code_reward_func": 0.0833333358168602, + "rewards/len_reward_func": 0.40387406945228577, "step": 135 }, { - "completion_length": 132.95833587646484, - "epoch": 2.192, - "grad_norm": 1.5112136833863712, - "kl": 0.02960205078125, + "completion_length": 32.25000190734863, + "epoch": 2.16, + "grad_norm": 1.1359022223853992, + "kl": 0.21728515625, "learning_rate": 4.2706273161393326e-07, - "loss": 0.0, - "reward": 0.4487263262271881, - "reward_std": 0.24802188575267792, - "rewards/correct_code_reward_func": 0.0625, - "rewards/len_reward_func": 0.3862263262271881, + "loss": 0.0002, + "reward": 0.6075980365276337, + "reward_std": 0.19440394639968872, + "rewards/correct_code_reward_func": 0.125, + "rewards/len_reward_func": 0.48259803652763367, "step": 136 }, { - "completion_length": 226.1041717529297, - "epoch": 2.208, - "grad_norm": 0.4826081041388911, - "kl": 0.00726318359375, + "completion_length": 19.77083396911621, + "epoch": 2.176, + "grad_norm": 6.664411559311778, + "kl": 0.345703125, "learning_rate": 4.259158078935615e-07, - "loss": 0.0, - "reward": 0.3881556838750839, - "reward_std": 0.25453752279281616, - "rewards/correct_code_reward_func": 0.0416666679084301, - "rewards/len_reward_func": 0.3464890122413635, + "loss": 0.0003, + "reward": 0.8192708492279053, + "reward_std": 0.37694354355335236, + "rewards/correct_code_reward_func": 0.3541666716337204, + "rewards/len_reward_func": 0.4651041626930237, "step": 137 }, { - "completion_length": 130.5208396911621, - "epoch": 2.224, - "grad_norm": 2.325519588373355, - "kl": 0.0572509765625, + "completion_length": 39.125, + "epoch": 2.192, + "grad_norm": 1.456816045019513, + "kl": 0.21484375, "learning_rate": 4.2476150309821437e-07, - "loss": 0.0001, - "reward": 0.3898926377296448, - "reward_std": 0.37525950372219086, - "rewards/correct_code_reward_func": 0.0416666679084301, - "rewards/len_reward_func": 0.3482259660959244, + "loss": 0.0002, + "reward": 0.6840218156576157, + "reward_std": 0.2802516594529152, + "rewards/correct_code_reward_func": 0.25, + "rewards/len_reward_func": 0.43402181565761566, "step": 138 }, { - "completion_length": 230.77084350585938, - "epoch": 2.24, - "grad_norm": 0.36697980793716295, - "kl": 0.00534820556640625, + "completion_length": 33.187500953674316, + "epoch": 2.208, + "grad_norm": 1.3547877377100939, + "kl": 0.134033203125, "learning_rate": 4.235998656602091e-07, - "loss": 0.0, - "reward": 0.41781722009181976, - "reward_std": 0.33470213413238525, - "rewards/correct_code_reward_func": 0.06250000186264515, - "rewards/len_reward_func": 0.35531722009181976, + "loss": 0.0001, + "reward": 0.5814135670661926, + "reward_std": 0.23970085382461548, + "rewards/correct_code_reward_func": 0.10416666977107525, + "rewards/len_reward_func": 0.47724688053131104, "step": 139 }, { - "completion_length": 80.14583587646484, - "epoch": 2.2560000000000002, - "grad_norm": 1.0642994877333616, - "kl": 0.03546142578125, + "completion_length": 36.95833492279053, + "epoch": 2.224, + "grad_norm": 1.2720844709707944, + "kl": 0.2001953125, "learning_rate": 4.2243094431952607e-07, - "loss": 0.0, - "reward": 0.5032858103513718, - "reward_std": 0.43102049827575684, - "rewards/correct_code_reward_func": 0.1666666716337204, - "rewards/len_reward_func": 0.33661915361881256, + "loss": 0.0002, + "reward": 0.638367086648941, + "reward_std": 0.2601431868970394, + "rewards/correct_code_reward_func": 0.16666667722165585, + "rewards/len_reward_func": 0.47170040011405945, "step": 140 }, { - "completion_length": 67.81250190734863, - "epoch": 2.2720000000000002, - "grad_norm": 0.6181454553082192, - "kl": 0.026611328125, + "completion_length": 39.02083396911621, + "epoch": 2.24, + "grad_norm": 2.1139624484094615, + "kl": 0.16943359375, "learning_rate": 4.2125478812176363e-07, - "loss": 0.0, - "reward": 0.6038364768028259, - "reward_std": 0.4535322040319443, - "rewards/correct_code_reward_func": 0.2500000074505806, - "rewards/len_reward_func": 0.3538364917039871, + "loss": 0.0002, + "reward": 0.5840575993061066, + "reward_std": 0.21259387582540512, + "rewards/correct_code_reward_func": 0.1250000037252903, + "rewards/len_reward_func": 0.4590575695037842, "step": 141 }, { - "completion_length": 203.5416717529297, - "epoch": 2.288, - "grad_norm": 1.0881719262639968, - "kl": 0.02996826171875, + "completion_length": 18.14583396911621, + "epoch": 2.2560000000000002, + "grad_norm": 3.9393507845319564, + "kl": 0.3291015625, "learning_rate": 4.2007144641608035e-07, - "loss": 0.0, - "reward": 0.46139389276504517, - "reward_std": 0.4121973514556885, - "rewards/correct_code_reward_func": 0.1458333395421505, - "rewards/len_reward_func": 0.31556057929992676, + "loss": 0.0003, + "reward": 0.7043379247188568, + "reward_std": 0.22067928314208984, + "rewards/correct_code_reward_func": 0.229166679084301, + "rewards/len_reward_func": 0.4751712381839752, "step": 142 }, { - "completion_length": 80.18750381469727, - "epoch": 2.304, - "grad_norm": 0.6632909295929738, - "kl": 0.016937255859375, + "completion_length": 21.312500476837158, + "epoch": 2.2720000000000002, + "grad_norm": 3.09831462467243, + "kl": 0.34912109375, "learning_rate": 4.188809688531241e-07, - "loss": 0.0, - "reward": 0.5671037137508392, - "reward_std": 0.40752534568309784, - "rewards/correct_code_reward_func": 0.2916666716337204, - "rewards/len_reward_func": 0.27543704956769943, + "loss": 0.0003, + "reward": 0.8270089626312256, + "reward_std": 0.38452374935150146, + "rewards/correct_code_reward_func": 0.3750000149011612, + "rewards/len_reward_func": 0.4520089328289032, "step": 143 }, { - "completion_length": 107.0, - "epoch": 2.32, - "grad_norm": 0.8436376843098686, - "kl": 0.023773193359375, + "completion_length": 37.895835876464844, + "epoch": 2.288, + "grad_norm": 2.026442157947375, + "kl": 0.178955078125, "learning_rate": 4.1768340538294914e-07, - "loss": 0.0, - "reward": 0.5688771307468414, - "reward_std": 0.3510543406009674, - "rewards/correct_code_reward_func": 0.25, - "rewards/len_reward_func": 0.3188771605491638, + "loss": 0.0002, + "reward": 0.6618677079677582, + "reward_std": 0.22813905775547028, + "rewards/correct_code_reward_func": 0.2083333432674408, + "rewards/len_reward_func": 0.4535343796014786, "step": 144 }, { - "completion_length": 109.56250381469727, - "epoch": 2.336, - "grad_norm": 0.6516366191860941, - "kl": 0.0186767578125, + "completion_length": 20.625, + "epoch": 2.304, + "grad_norm": 5.079652549790452, + "kl": 0.287109375, "learning_rate": 4.1647880625292027e-07, - "loss": 0.0, - "reward": 0.5211996138095856, - "reward_std": 0.3672170042991638, - "rewards/correct_code_reward_func": 0.1458333395421505, - "rewards/len_reward_func": 0.3753662556409836, + "loss": 0.0003, + "reward": 0.9085739850997925, + "reward_std": 0.4442262500524521, + "rewards/correct_code_reward_func": 0.4583333432674408, + "rewards/len_reward_func": 0.4502406120300293, "step": 145 }, { - "completion_length": 112.14583969116211, - "epoch": 2.352, - "grad_norm": 0.7745258386088306, - "kl": 0.026763916015625, + "completion_length": 26.375000953674316, + "epoch": 2.32, + "grad_norm": 8.765502969688386, + "kl": 0.27099609375, "learning_rate": 4.1526722200560436e-07, - "loss": 0.0, - "reward": 0.4639211893081665, - "reward_std": 0.3190876245498657, - "rewards/correct_code_reward_func": 0.0833333358168602, - "rewards/len_reward_func": 0.3805878460407257, + "loss": 0.0003, + "reward": 1.0286418199539185, + "reward_std": 0.5029588490724564, + "rewards/correct_code_reward_func": 0.5833333730697632, + "rewards/len_reward_func": 0.44530846178531647, "step": 146 }, { - "completion_length": 145.2291717529297, - "epoch": 2.368, - "grad_norm": 0.6108882026127638, - "kl": 0.012847900390625, + "completion_length": 33.39583396911621, + "epoch": 2.336, + "grad_norm": 4.101562007961352, + "kl": 0.1669921875, "learning_rate": 4.140487034766499e-07, - "loss": 0.0, - "reward": 0.4563918560743332, - "reward_std": 0.3183119148015976, - "rewards/correct_code_reward_func": 0.0833333358168602, - "rewards/len_reward_func": 0.3730585128068924, + "loss": 0.0002, + "reward": 0.9583333432674408, + "reward_std": 0.31142252683639526, + "rewards/correct_code_reward_func": 0.479166679084301, + "rewards/len_reward_func": 0.4791666716337204, "step": 147 }, { - "completion_length": 118.75000381469727, - "epoch": 2.384, - "grad_norm": 0.7877642405429334, - "kl": 0.0147705078125, + "completion_length": 33.250000953674316, + "epoch": 2.352, + "grad_norm": 1.3769018597142264, + "kl": 0.23291015625, "learning_rate": 4.1282330179265377e-07, - "loss": 0.0, - "reward": 0.4855644255876541, - "reward_std": 0.40131522715091705, - "rewards/correct_code_reward_func": 0.1875000074505806, - "rewards/len_reward_func": 0.2980644255876541, + "loss": 0.0002, + "reward": 0.728137880563736, + "reward_std": 0.3594149053096771, + "rewards/correct_code_reward_func": 0.2500000074505806, + "rewards/len_reward_func": 0.47813786566257477, "step": 148 }, { - "completion_length": 107.54166793823242, - "epoch": 2.4, - "grad_norm": 0.5251281341532755, - "kl": 0.01385498046875, + "completion_length": 35.10416793823242, + "epoch": 2.368, + "grad_norm": 1.764712623454699, + "kl": 0.187255859375, "learning_rate": 4.115910683690167e-07, - "loss": 0.0, - "reward": 0.47337688505649567, - "reward_std": 0.39969322085380554, - "rewards/correct_code_reward_func": 0.10416666977107525, - "rewards/len_reward_func": 0.36921024322509766, + "loss": 0.0002, + "reward": 0.6034694612026215, + "reward_std": 0.18779680132865906, + "rewards/correct_code_reward_func": 0.1458333432674408, + "rewards/len_reward_func": 0.45763610303401947, "step": 149 }, { - "completion_length": 116.52083587646484, - "epoch": 2.416, - "grad_norm": 0.5833622240495556, - "kl": 0.0223388671875, + "completion_length": 23.58333396911621, + "epoch": 2.384, + "grad_norm": 1.5094768705414543, + "kl": 0.3095703125, "learning_rate": 4.1035205490778496e-07, - "loss": 0.0, - "reward": 0.40829063951969147, - "reward_std": 0.2583230659365654, - "rewards/correct_code_reward_func": 0.0416666679084301, - "rewards/len_reward_func": 0.36662398278713226, + "loss": 0.0003, + "reward": 0.8031023442745209, + "reward_std": 0.2327413372695446, + "rewards/correct_code_reward_func": 0.3541666716337204, + "rewards/len_reward_func": 0.44893570244312286, "step": 150 }, { - "completion_length": 83.50000381469727, - "epoch": 2.432, - "grad_norm": 0.5302455273183175, - "kl": 0.015350341796875, + "completion_length": 35.125, + "epoch": 2.4, + "grad_norm": 2.0077887373974734, + "kl": 0.19970703125, "learning_rate": 4.09106313395482e-07, - "loss": 0.0, - "reward": 0.4789588153362274, - "reward_std": 0.3553490489721298, - "rewards/correct_code_reward_func": 0.10416666977107525, - "rewards/len_reward_func": 0.374792143702507, + "loss": 0.0002, + "reward": 0.7374315559864044, + "reward_std": 0.3449878916144371, + "rewards/correct_code_reward_func": 0.3125000111758709, + "rewards/len_reward_func": 0.42493152618408203, "step": 151 }, { - "completion_length": 95.95833587646484, - "epoch": 2.448, - "grad_norm": 0.6468368576991631, - "kl": 0.01800537109375, + "completion_length": 27.979167938232422, + "epoch": 2.416, + "grad_norm": 1.6931078080658426, + "kl": 0.30908203125, "learning_rate": 4.078538961009268e-07, - "loss": 0.0, - "reward": 0.6159647405147552, - "reward_std": 0.4978296458721161, - "rewards/correct_code_reward_func": 0.2708333432674408, - "rewards/len_reward_func": 0.34513136744499207, + "loss": 0.0003, + "reward": 0.7056878507137299, + "reward_std": 0.2828039154410362, + "rewards/correct_code_reward_func": 0.25000000558793545, + "rewards/len_reward_func": 0.45568785071372986, "step": 152 }, { - "completion_length": 132.04166793823242, - "epoch": 2.464, - "grad_norm": 0.6586059121985153, - "kl": 0.02117919921875, + "completion_length": 26.041667938232422, + "epoch": 2.432, + "grad_norm": 3.4873197344938505, + "kl": 0.234375, "learning_rate": 4.0659485557304047e-07, - "loss": 0.0, - "reward": 0.6003018617630005, - "reward_std": 0.3758309483528137, - "rewards/correct_code_reward_func": 0.2500000074505806, - "rewards/len_reward_func": 0.3503018915653229, + "loss": 0.0002, + "reward": 0.8596743643283844, + "reward_std": 0.3951665312051773, + "rewards/correct_code_reward_func": 0.375, + "rewards/len_reward_func": 0.4846743494272232, "step": 153 }, { - "completion_length": 93.97916793823242, - "epoch": 2.48, - "grad_norm": 1.2292815459155835, - "kl": 0.03778076171875, + "completion_length": 25.416667938232422, + "epoch": 2.448, + "grad_norm": 0.5832761487278126, + "kl": 0.28125, "learning_rate": 4.0532924463864214e-07, - "loss": 0.0, - "reward": 0.556082546710968, - "reward_std": 0.35523243248462677, - "rewards/correct_code_reward_func": 0.2083333358168602, - "rewards/len_reward_func": 0.3477492183446884, + "loss": 0.0003, + "reward": 0.875, + "reward_std": 0.16623876243829727, + "rewards/correct_code_reward_func": 0.3958333432674408, + "rewards/len_reward_func": 0.4791666716337204, "step": 154 }, { - "completion_length": 74.62500381469727, - "epoch": 2.496, - "grad_norm": 1.1496692424806267, - "kl": 0.0469970703125, + "completion_length": 33.75000190734863, + "epoch": 2.464, + "grad_norm": 3.3108288489196216, + "kl": 0.24169921875, "learning_rate": 4.040571164002318e-07, - "loss": 0.0, - "reward": 0.5786138772964478, - "reward_std": 0.33524368703365326, - "rewards/correct_code_reward_func": 0.2083333432674408, - "rewards/len_reward_func": 0.37028053402900696, + "loss": 0.0002, + "reward": 0.8077309429645538, + "reward_std": 0.4106632024049759, + "rewards/correct_code_reward_func": 0.3333333544433117, + "rewards/len_reward_func": 0.47439758479595184, "step": 155 }, { - "completion_length": 112.20833587646484, - "epoch": 2.512, - "grad_norm": 0.5223652262852928, - "kl": 0.0330810546875, + "completion_length": 28.02083396911621, + "epoch": 2.48, + "grad_norm": 1.642835238916714, + "kl": 0.23388671875, "learning_rate": 4.027785242337625e-07, - "loss": 0.0, - "reward": 0.5203375518321991, - "reward_std": 0.28021611273288727, - "rewards/correct_code_reward_func": 0.125, - "rewards/len_reward_func": 0.3953375071287155, + "loss": 0.0002, + "reward": 0.7023809850215912, + "reward_std": 0.30807141959667206, + "rewards/correct_code_reward_func": 0.229166679084301, + "rewards/len_reward_func": 0.4732142984867096, "step": 156 }, { - "completion_length": 74.27083587646484, - "epoch": 2.528, - "grad_norm": 0.8743818514712977, - "kl": 0.035888671875, + "completion_length": 23.375, + "epoch": 2.496, + "grad_norm": 2.10881607764319, + "kl": 0.29638671875, "learning_rate": 4.0149352178640084e-07, - "loss": 0.0, - "reward": 0.5434559881687164, - "reward_std": 0.2699009105563164, - "rewards/correct_code_reward_func": 0.25, - "rewards/len_reward_func": 0.29345598816871643, + "loss": 0.0003, + "reward": 0.7916666865348816, + "reward_std": 0.2994871214032173, + "rewards/correct_code_reward_func": 0.2916666679084301, + "rewards/len_reward_func": 0.5, "step": 157 }, { - "completion_length": 76.45833587646484, - "epoch": 2.544, - "grad_norm": 1.0770287409056003, - "kl": 0.0513916015625, + "completion_length": 39.437500953674316, + "epoch": 2.512, + "grad_norm": 1.494083480221189, + "kl": 0.24658203125, "learning_rate": 4.002021629742759e-07, - "loss": 0.0001, - "reward": 0.5278337150812149, - "reward_std": 0.3877341002225876, - "rewards/correct_code_reward_func": 0.1875000111758709, - "rewards/len_reward_func": 0.3403336852788925, + "loss": 0.0002, + "reward": 0.6607388556003571, + "reward_std": 0.2653798274695873, + "rewards/correct_code_reward_func": 0.2083333395421505, + "rewards/len_reward_func": 0.45240549743175507, "step": 158 }, { - "completion_length": 95.16667175292969, - "epoch": 2.56, - "grad_norm": 0.6512930175227467, - "kl": 0.0189208984375, + "completion_length": 24.83333396911621, + "epoch": 2.528, + "grad_norm": 4.504000272348399, + "kl": 0.30859375, "learning_rate": 3.9890450198021705e-07, - "loss": 0.0, - "reward": 0.5369605869054794, - "reward_std": 0.34080107510089874, - "rewards/correct_code_reward_func": 0.1458333395421505, - "rewards/len_reward_func": 0.39112721383571625, + "loss": 0.0003, + "reward": 0.8541666865348816, + "reward_std": 0.36753228306770325, + "rewards/correct_code_reward_func": 0.3541666865348816, + "rewards/len_reward_func": 0.5, "step": 159 }, { - "completion_length": 103.89583969116211, - "epoch": 2.576, - "grad_norm": 0.8478914110597111, - "kl": 0.05859375, + "completion_length": 26.45833396911621, + "epoch": 2.544, + "grad_norm": 4.070154440728412, + "kl": 0.3583984375, "learning_rate": 3.9760059325148063e-07, - "loss": 0.0001, - "reward": 0.5183965861797333, - "reward_std": 0.43399062752723694, - "rewards/correct_code_reward_func": 0.1458333358168602, - "rewards/len_reward_func": 0.3725632578134537, + "loss": 0.0004, + "reward": 1.0000000298023224, + "reward_std": 0.15430335700511932, + "rewards/correct_code_reward_func": 0.5000000298023224, + "rewards/len_reward_func": 0.5, "step": 160 }, { - "completion_length": 110.10417175292969, - "epoch": 2.592, - "grad_norm": 0.5810833950613928, - "kl": 0.025909423828125, + "completion_length": 31.791667938232422, + "epoch": 2.56, + "grad_norm": 1.213750670059646, + "kl": 0.24755859375, "learning_rate": 3.9629049149746556e-07, - "loss": 0.0, - "reward": 0.46043863892555237, - "reward_std": 0.25851407647132874, - "rewards/correct_code_reward_func": 0.08333333395421505, - "rewards/len_reward_func": 0.37710531055927277, + "loss": 0.0002, + "reward": 0.8541666865348816, + "reward_std": 0.3857453167438507, + "rewards/correct_code_reward_func": 0.3541666865348816, + "rewards/len_reward_func": 0.5, "step": 161 }, { - "completion_length": 68.79166793823242, - "epoch": 2.608, - "grad_norm": 0.8943010580879761, - "kl": 0.060302734375, + "completion_length": 35.500000953674316, + "epoch": 2.576, + "grad_norm": 4.880537766814768, + "kl": 0.2998046875, "learning_rate": 3.949742516874175e-07, - "loss": 0.0001, - "reward": 0.5949756503105164, - "reward_std": 0.3949763774871826, - "rewards/correct_code_reward_func": 0.2083333358168602, - "rewards/len_reward_func": 0.38664232194423676, + "loss": 0.0003, + "reward": 0.75, + "reward_std": 0.2840898931026459, + "rewards/correct_code_reward_func": 0.2708333358168602, + "rewards/len_reward_func": 0.4791666716337204, "step": 162 }, { - "completion_length": 79.12500381469727, - "epoch": 2.624, - "grad_norm": 0.8192201790358881, - "kl": 0.05584716796875, + "completion_length": 32.81250190734863, + "epoch": 2.592, + "grad_norm": 0.6480053728324462, + "kl": 0.2451171875, "learning_rate": 3.9365192904812263e-07, - "loss": 0.0001, - "reward": 0.4389507919549942, - "reward_std": 0.35593053698539734, - "rewards/correct_code_reward_func": 0.08333333395421505, - "rewards/len_reward_func": 0.3556174486875534, + "loss": 0.0002, + "reward": 0.7500000298023224, + "reward_std": 0.15430335700511932, + "rewards/correct_code_reward_func": 0.2500000074505806, + "rewards/len_reward_func": 0.5, "step": 163 }, { - "completion_length": 72.52083587646484, - "epoch": 2.64, - "grad_norm": 0.8024396739930885, - "kl": 0.044677734375, + "completion_length": 21.625000953674316, + "epoch": 2.608, + "grad_norm": 5.168623364947485, + "kl": 0.3251953125, "learning_rate": 3.9232357906159065e-07, - "loss": 0.0, - "reward": 0.4468836784362793, - "reward_std": 0.25362110137939453, - "rewards/correct_code_reward_func": 0.10416666977107525, - "rewards/len_reward_func": 0.3427170366048813, + "loss": 0.0003, + "reward": 0.9677360653877258, + "reward_std": 0.36441104114055634, + "rewards/correct_code_reward_func": 0.4791666716337204, + "rewards/len_reward_func": 0.48856931924819946, "step": 164 }, { - "completion_length": 104.27083587646484, - "epoch": 2.656, - "grad_norm": 0.4873005506771281, - "kl": 0.020751953125, + "completion_length": 27.229166984558105, + "epoch": 2.624, + "grad_norm": 1.7162957930860199, + "kl": 0.20654296875, "learning_rate": 3.909892574627266e-07, - "loss": 0.0, - "reward": 0.5589407980442047, - "reward_std": 0.454597607254982, - "rewards/correct_code_reward_func": 0.2083333432674408, - "rewards/len_reward_func": 0.35060742497444153, + "loss": 0.0002, + "reward": 0.7291666865348816, + "reward_std": 0.1480126492679119, + "rewards/correct_code_reward_func": 0.229166679084301, + "rewards/len_reward_func": 0.5, "step": 165 }, { - "completion_length": 144.52083587646484, - "epoch": 2.672, - "grad_norm": 0.8704908637654086, - "kl": 0.041290283203125, + "completion_length": 26.562500953674316, + "epoch": 2.64, + "grad_norm": 7.012645050314315, + "kl": 0.265625, "learning_rate": 3.8964902023699234e-07, - "loss": 0.0, - "reward": 0.49246758222579956, - "reward_std": 0.40906646847724915, - "rewards/correct_code_reward_func": 0.1458333358168602, - "rewards/len_reward_func": 0.34663423895835876, + "loss": 0.0003, + "reward": 0.8125, + "reward_std": 0.21322893351316452, + "rewards/correct_code_reward_func": 0.3125, + "rewards/len_reward_func": 0.5, "step": 166 }, { - "completion_length": 96.58333587646484, - "epoch": 2.6879999999999997, - "grad_norm": 1.0438466006168627, - "kl": 0.043701171875, + "completion_length": 33.4375, + "epoch": 2.656, + "grad_norm": 1.4618104253449193, + "kl": 0.29443359375, "learning_rate": 3.8830292361805767e-07, - "loss": 0.0, - "reward": 0.4835583567619324, - "reward_std": 0.34541426599025726, - "rewards/correct_code_reward_func": 0.1041666679084301, - "rewards/len_reward_func": 0.37939170002937317, + "loss": 0.0003, + "reward": 0.8750000596046448, + "reward_std": 0.22233545035123825, + "rewards/correct_code_reward_func": 0.3750000149011612, + "rewards/len_reward_func": 0.5, "step": 167 }, { - "completion_length": 78.66666793823242, - "epoch": 2.7039999999999997, - "grad_norm": 0.7819118541765582, - "kl": 0.05108642578125, + "completion_length": 50.333335876464844, + "epoch": 2.672, + "grad_norm": 1.1548968690502175, + "kl": 0.3291015625, "learning_rate": 3.869510240854407e-07, - "loss": 0.0001, - "reward": 0.6229337453842163, - "reward_std": 0.47181543707847595, - "rewards/correct_code_reward_func": 0.2291666716337204, - "rewards/len_reward_func": 0.3937670886516571, + "loss": 0.0003, + "reward": 0.7708333432674408, + "reward_std": 0.13607724010944366, + "rewards/correct_code_reward_func": 0.2916666716337204, + "rewards/len_reward_func": 0.4791666716337204, "step": 168 }, { - "completion_length": 68.04166984558105, - "epoch": 2.7199999999999998, - "grad_norm": 0.6321138877784398, - "kl": 0.0426025390625, + "completion_length": 32.77083492279053, + "epoch": 2.6879999999999997, + "grad_norm": 1.7708211120173893, + "kl": 0.22509765625, "learning_rate": 3.855933783621383e-07, - "loss": 0.0, - "reward": 0.5318568348884583, - "reward_std": 0.40405046939849854, - "rewards/correct_code_reward_func": 0.1666666716337204, - "rewards/len_reward_func": 0.36519019305706024, + "loss": 0.0002, + "reward": 0.5873316824436188, + "reward_std": 0.34993261098861694, + "rewards/correct_code_reward_func": 0.1250000037252903, + "rewards/len_reward_func": 0.4623316675424576, "step": 169 }, { - "completion_length": 88.56250381469727, - "epoch": 2.7359999999999998, - "grad_norm": 0.5828268627562292, - "kl": 0.06005859375, + "completion_length": 37.14583396911621, + "epoch": 2.7039999999999997, + "grad_norm": 0.8273511612323733, + "kl": 0.24169921875, "learning_rate": 3.8423004341224595e-07, - "loss": 0.0001, - "reward": 0.5125512927770615, - "reward_std": 0.20025938376784325, - "rewards/correct_code_reward_func": 0.0625, - "rewards/len_reward_func": 0.4500512629747391, + "loss": 0.0002, + "reward": 1.125, + "reward_std": 0.1178511306643486, + "rewards/correct_code_reward_func": 0.625, + "rewards/len_reward_func": 0.5, "step": 170 }, { - "completion_length": 52.95833396911621, - "epoch": 2.752, - "grad_norm": 1.0950968529016254, - "kl": 0.093505859375, + "completion_length": 25.854167938232422, + "epoch": 2.7199999999999998, + "grad_norm": 4.844327377758548, + "kl": 0.296875, "learning_rate": 3.828610764385676e-07, - "loss": 0.0001, - "reward": 0.5501560568809509, - "reward_std": 0.2839180529117584, - "rewards/correct_code_reward_func": 0.1666666716337204, - "rewards/len_reward_func": 0.38348935544490814, + "loss": 0.0003, + "reward": 0.8125000298023224, + "reward_std": 0.3205290399491787, + "rewards/correct_code_reward_func": 0.31250002048909664, + "rewards/len_reward_func": 0.5, "step": 171 }, { - "completion_length": 76.64583587646484, - "epoch": 2.768, - "grad_norm": 0.8956790920239291, - "kl": 0.052734375, + "completion_length": 37.58333492279053, + "epoch": 2.7359999999999998, + "grad_norm": 0.85750009835101, + "kl": 0.22412109375, "learning_rate": 3.8148653488021566e-07, - "loss": 0.0001, - "reward": 0.476114884018898, - "reward_std": 0.3231390118598938, - "rewards/correct_code_reward_func": 0.1250000037252903, - "rewards/len_reward_func": 0.351114884018898, + "loss": 0.0002, + "reward": 0.8333333730697632, + "reward_std": 0.17251639068126678, + "rewards/correct_code_reward_func": 0.3333333432674408, + "rewards/len_reward_func": 0.5, "step": 172 }, { - "completion_length": 85.87500381469727, - "epoch": 2.784, - "grad_norm": 0.6452120175663968, - "kl": 0.03240966796875, + "completion_length": 19.39583396911621, + "epoch": 2.752, + "grad_norm": 4.546104049005407, + "kl": 0.388671875, "learning_rate": 3.801064764102011e-07, - "loss": 0.0, - "reward": 0.3626401424407959, - "reward_std": 0.2630883455276489, - "rewards/correct_code_reward_func": 0.02083333395421505, - "rewards/len_reward_func": 0.3418068289756775, + "loss": 0.0004, + "reward": 0.9583333730697632, + "reward_std": 0.3268197476863861, + "rewards/correct_code_reward_func": 0.458333358168602, + "rewards/len_reward_func": 0.5, "step": 173 }, { - "completion_length": 69.93750381469727, - "epoch": 2.8, - "grad_norm": 0.899618033727736, - "kl": 0.034423828125, + "completion_length": 21.041666984558105, + "epoch": 2.768, + "grad_norm": 2.488704592885805, + "kl": 0.328125, "learning_rate": 3.787209589330134e-07, - "loss": 0.0, - "reward": 0.565394401550293, - "reward_std": 0.2511584088206291, - "rewards/correct_code_reward_func": 0.125, - "rewards/len_reward_func": 0.4403943866491318, - "step": 174 + "loss": 0.0003, + "reward": 0.8333333432674408, + "reward_std": 0.2903675436973572, + "rewards/correct_code_reward_func": 0.3333333432674408, + "rewards/len_reward_func": 0.5, + "step": 174 }, { - "completion_length": 75.6875, - "epoch": 2.816, - "grad_norm": 0.9258826637058537, - "kl": 0.0552978515625, + "completion_length": 25.750000953674316, + "epoch": 2.784, + "grad_norm": 3.4706174977001085, + "kl": 0.3203125, "learning_rate": 3.773300405821908e-07, - "loss": 0.0001, - "reward": 0.5098282545804977, - "reward_std": 0.347461462020874, - "rewards/correct_code_reward_func": 0.1458333358168602, - "rewards/len_reward_func": 0.36399491131305695, + "loss": 0.0003, + "reward": 0.625, + "reward_std": 0.22233545035123825, + "rewards/correct_code_reward_func": 0.125, + "rewards/len_reward_func": 0.5, "step": 175 }, { - "completion_length": 79.375, - "epoch": 2.832, - "grad_norm": 0.8016591610038067, - "kl": 0.0675048828125, + "completion_length": 22.125000953674316, + "epoch": 2.8, + "grad_norm": 6.035407454671845, + "kl": 0.333984375, "learning_rate": 3.759337797178816e-07, - "loss": 0.0001, - "reward": 0.49655479192733765, - "reward_std": 0.15379413217306137, - "rewards/correct_code_reward_func": 0.0833333358168602, - "rewards/len_reward_func": 0.41322144865989685, + "loss": 0.0003, + "reward": 0.7083333730697632, + "reward_std": 0.24966806918382645, + "rewards/correct_code_reward_func": 0.2083333395421505, + "rewards/len_reward_func": 0.5, "step": 176 }, { - "completion_length": 57.47916793823242, - "epoch": 2.848, - "grad_norm": 1.5666794069659344, - "kl": 0.0653076171875, + "completion_length": 26.854166984558105, + "epoch": 2.816, + "grad_norm": 0.8948891454908713, + "kl": 0.4052734375, "learning_rate": 3.745322349243954e-07, - "loss": 0.0001, - "reward": 0.5412343591451645, - "reward_std": 0.4381273090839386, - "rewards/correct_code_reward_func": 0.1875, - "rewards/len_reward_func": 0.3537343591451645, + "loss": 0.0004, + "reward": 0.7708333730697632, + "reward_std": 0.14801263809204102, + "rewards/correct_code_reward_func": 0.2708333432674408, + "rewards/len_reward_func": 0.5, "step": 177 }, { - "completion_length": 85.35416793823242, - "epoch": 2.864, - "grad_norm": 0.48459539836244053, - "kl": 0.0509033203125, + "completion_length": 43.39583396911621, + "epoch": 2.832, + "grad_norm": 1.4563433011031992, + "kl": 0.215087890625, "learning_rate": 3.7312546500774455e-07, - "loss": 0.0001, - "reward": 0.5237628370523453, - "reward_std": 0.15980007499456406, - "rewards/correct_code_reward_func": 0.12500000558793545, - "rewards/len_reward_func": 0.3987628370523453, + "loss": 0.0002, + "reward": 0.8541666865348816, + "reward_std": 0.30859364569187164, + "rewards/correct_code_reward_func": 0.3541666865348816, + "rewards/len_reward_func": 0.5, "step": 178 }, { - "completion_length": 78.20833396911621, - "epoch": 2.88, - "grad_norm": 1.3898787394908887, - "kl": 0.05059814453125, + "completion_length": 17.562500953674316, + "epoch": 2.848, + "grad_norm": 6.496732643288798, + "kl": 0.3759765625, "learning_rate": 3.717135289931774e-07, - "loss": 0.0001, - "reward": 0.5607030093669891, - "reward_std": 0.3186104744672775, - "rewards/correct_code_reward_func": 0.1458333358168602, - "rewards/len_reward_func": 0.41486963629722595, + "loss": 0.0004, + "reward": 0.8333333432674408, + "reward_std": 0.19500282034277916, + "rewards/correct_code_reward_func": 0.3333333432674408, + "rewards/len_reward_func": 0.5, "step": 179 }, { - "completion_length": 89.25000381469727, - "epoch": 2.896, - "grad_norm": 0.9218365412020941, - "kl": 0.09429931640625, + "completion_length": 43.83333396911621, + "epoch": 2.864, + "grad_norm": 0.862953352923827, + "kl": 0.63671875, "learning_rate": 3.7029648612270123e-07, - "loss": 0.0001, - "reward": 0.49387598037719727, - "reward_std": 0.3254713863134384, - "rewards/correct_code_reward_func": 0.08333333395421505, - "rewards/len_reward_func": 0.41054263710975647, + "loss": 0.0006, + "reward": 0.7500000298023224, + "reward_std": 0.22233543917536736, + "rewards/correct_code_reward_func": 0.2500000149011612, + "rewards/len_reward_func": 0.5, "step": 180 }, { - "completion_length": 79.50000381469727, - "epoch": 2.912, - "grad_norm": 0.8027279022718086, - "kl": 0.052734375, + "completion_length": 44.35416793823242, + "epoch": 2.88, + "grad_norm": 1.1006354405018577, + "kl": 0.19970703125, "learning_rate": 3.688743958525969e-07, - "loss": 0.0001, - "reward": 0.464905709028244, - "reward_std": 0.3140018880367279, - "rewards/correct_code_reward_func": 0.08333333395421505, - "rewards/len_reward_func": 0.3815723806619644, + "loss": 0.0002, + "reward": 0.7916666865348816, + "reward_std": 0.2840898931026459, + "rewards/correct_code_reward_func": 0.2916666716337204, + "rewards/len_reward_func": 0.5, "step": 181 }, { - "completion_length": 81.16666793823242, - "epoch": 2.928, - "grad_norm": 1.170123308898757, - "kl": 0.0556640625, + "completion_length": 30.02083396911621, + "epoch": 2.896, + "grad_norm": 0.43741770998780366, + "kl": 0.3212890625, "learning_rate": 3.6744731785092393e-07, - "loss": 0.0001, - "reward": 0.5409459471702576, - "reward_std": 0.15436074882745743, - "rewards/correct_code_reward_func": 0.1666666716337204, - "rewards/len_reward_func": 0.374279260635376, + "loss": 0.0003, + "reward": 0.7500000298023224, + "reward_std": 0.08908708393573761, + "rewards/correct_code_reward_func": 0.2500000074505806, + "rewards/len_reward_func": 0.5, "step": 182 }, { - "completion_length": 74.64583587646484, - "epoch": 2.944, - "grad_norm": 3.279187753830931, - "kl": 0.1982421875, + "completion_length": 39.291666984558105, + "epoch": 2.912, + "grad_norm": 1.0212558082151113, + "kl": 0.263671875, "learning_rate": 3.660153119950171e-07, - "loss": 0.0002, - "reward": 0.6114458441734314, - "reward_std": 0.4259416460990906, - "rewards/correct_code_reward_func": 0.2083333432674408, - "rewards/len_reward_func": 0.4031124860048294, + "loss": 0.0003, + "reward": 0.7708333730697632, + "reward_std": 0.14801263809204102, + "rewards/correct_code_reward_func": 0.2708333432674408, + "rewards/len_reward_func": 0.5, "step": 183 }, { - "completion_length": 99.12500381469727, - "epoch": 2.96, - "grad_norm": 1.2656756790220016, - "kl": 0.0599365234375, + "completion_length": 32.14583492279053, + "epoch": 2.928, + "grad_norm": 1.0645346014541865, + "kl": 0.28955078125, "learning_rate": 3.6457843836897417e-07, - "loss": 0.0001, - "reward": 0.45475544035434723, - "reward_std": 0.308984711766243, - "rewards/correct_code_reward_func": 0.0625, - "rewards/len_reward_func": 0.3922554701566696, + "loss": 0.0003, + "reward": 0.9375, + "reward_std": 0.3630879074335098, + "rewards/correct_code_reward_func": 0.4583333432674408, + "rewards/len_reward_func": 0.4791666716337204, "step": 184 }, { - "completion_length": 69.2916669845581, - "epoch": 2.976, - "grad_norm": 0.9538264487190198, - "kl": 0.114501953125, + "completion_length": 27.70833396911621, + "epoch": 2.944, + "grad_norm": 4.902484194089981, + "kl": 0.3564453125, "learning_rate": 3.6313675726113475e-07, - "loss": 0.0001, - "reward": 0.5234346687793732, - "reward_std": 0.25931820273399353, - "rewards/correct_code_reward_func": 0.1041666679084301, - "rewards/len_reward_func": 0.4192679822444916, + "loss": 0.0004, + "reward": 0.979166716337204, + "reward_std": 0.33592626452445984, + "rewards/correct_code_reward_func": 0.4791666865348816, + "rewards/len_reward_func": 0.5, "step": 185 }, { - "completion_length": 113.68750381469727, - "epoch": 2.992, - "grad_norm": 0.6804643090555511, - "kl": 0.0214080810546875, + "completion_length": 38.5, + "epoch": 2.96, + "grad_norm": 2.7460144649848104, + "kl": 0.2890625, "learning_rate": 3.6169032916155055e-07, - "loss": 0.0, - "reward": 0.561868205666542, - "reward_std": 0.21909521520137787, - "rewards/correct_code_reward_func": 0.16666667722165585, - "rewards/len_reward_func": 0.39520153403282166, + "loss": 0.0003, + "reward": 0.7708333432674408, + "reward_std": 0.24056155234575272, + "rewards/correct_code_reward_func": 0.2708333432674408, + "rewards/len_reward_func": 0.5, "step": 186 }, { - "completion_length": 54.75000127156576, - "epoch": 3.016, - "grad_norm": 0.6662972654123027, - "kl": 0.06339518229166667, + "completion_length": 30.375001907348633, + "epoch": 2.976, + "grad_norm": 1.5868616221089409, + "kl": 0.4033203125, "learning_rate": 3.602392147594479e-07, - "loss": 0.0001, - "reward": 0.5856050451596578, - "reward_std": 0.20766562471787134, - "rewards/correct_code_reward_func": 0.1250000074505806, - "rewards/len_reward_func": 0.46060502529144287, + "loss": 0.0004, + "reward": 0.9375000298023224, + "reward_std": 0.30231601744890213, + "rewards/correct_code_reward_func": 0.4375000149011612, + "rewards/len_reward_func": 0.5, "step": 187 }, { - "completion_length": 112.5625, - "epoch": 3.032, - "grad_norm": 0.9637994129535737, - "kl": 0.06414794921875, + "completion_length": 31.979167938232422, + "epoch": 2.992, + "grad_norm": 1.6492447606938656, + "kl": 0.765625, "learning_rate": 3.587834749406808e-07, - "loss": 0.0001, - "reward": 0.5407692492008209, - "reward_std": 0.4349091351032257, - "rewards/correct_code_reward_func": 0.16666667722165585, - "rewards/len_reward_func": 0.37410253286361694, + "loss": 0.0008, + "reward": 0.791666716337204, + "reward_std": 0.1451837606728077, + "rewards/correct_code_reward_func": 0.29166667722165585, + "rewards/len_reward_func": 0.5, "step": 188 }, { - "completion_length": 86.10416793823242, - "epoch": 3.048, - "grad_norm": 0.7131912130076471, - "kl": 0.0755615234375, + "completion_length": 34.583335876464844, + "epoch": 3.0, + "grad_norm": 1.6492447606938656, + "kl": 0.353515625, "learning_rate": 3.573231707851765e-07, - "loss": 0.0001, - "reward": 0.7036676704883575, - "reward_std": 0.2348622828722, - "rewards/correct_code_reward_func": 0.2291666716337204, - "rewards/len_reward_func": 0.47450098395347595, + "loss": 0.0002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/correct_code_reward_func": 0.0, + "rewards/len_reward_func": 0.5, "step": 189 }, { - "completion_length": 85.54166984558105, - "epoch": 3.064, - "grad_norm": 0.7623498616231758, - "kl": 0.07427978515625, + "completion_length": 25.437501907348633, + "epoch": 3.016, + "grad_norm": 0.6202012399572476, + "kl": 0.328125, "learning_rate": 3.558583635643726e-07, - "loss": 0.0001, - "reward": 0.5809785723686218, - "reward_std": 0.3845784664154053, - "rewards/correct_code_reward_func": 0.1875, - "rewards/len_reward_func": 0.3934786021709442, + "loss": 0.0003, + "reward": 0.7916666865348816, + "reward_std": 0.2342708557844162, + "rewards/correct_code_reward_func": 0.291666679084301, + "rewards/len_reward_func": 0.5, "step": 190 }, { - "completion_length": 79.85416793823242, - "epoch": 3.08, - "grad_norm": 1.2708683319836989, - "kl": 0.0460205078125, + "completion_length": 45.16666793823242, + "epoch": 3.032, + "grad_norm": 3.268492949679133, + "kl": 0.30029296875, "learning_rate": 3.543891147386463e-07, - "loss": 0.0, - "reward": 0.6348311603069305, - "reward_std": 0.42308229207992554, - "rewards/correct_code_reward_func": 0.29166667722165585, - "rewards/len_reward_func": 0.34316447377204895, + "loss": 0.0003, + "reward": 0.9375, + "reward_std": 0.22516433894634247, + "rewards/correct_code_reward_func": 0.4375, + "rewards/len_reward_func": 0.5, "step": 191 }, { - "completion_length": 70.66666984558105, - "epoch": 3.096, - "grad_norm": 0.8954738617939487, - "kl": 0.090087890625, + "completion_length": 24.45833396911621, + "epoch": 3.048, + "grad_norm": 5.775551595741029, + "kl": 0.31640625, "learning_rate": 3.52915485954736e-07, - "loss": 0.0001, - "reward": 0.48272156715393066, - "reward_std": 0.35741594433784485, - "rewards/correct_code_reward_func": 0.12500000558793545, - "rewards/len_reward_func": 0.35772158205509186, + "loss": 0.0003, + "reward": 1.125, + "reward_std": 0.2994871102273464, + "rewards/correct_code_reward_func": 0.625, + "rewards/len_reward_func": 0.5, "step": 192 }, { - "completion_length": 55.520835876464844, - "epoch": 3.112, - "grad_norm": 0.6882137502479935, - "kl": 0.0472412109375, + "completion_length": 33.77083396911621, + "epoch": 3.064, + "grad_norm": 1.32331402024035, + "kl": 11.91748046875, "learning_rate": 3.514375390431539e-07, - "loss": 0.0, - "reward": 0.7783247828483582, - "reward_std": 0.37427644431591034, - "rewards/correct_code_reward_func": 0.3333333432674408, - "rewards/len_reward_func": 0.44499143958091736, + "loss": 0.012, + "reward": 0.7708333730697632, + "reward_std": 0.16340987384319305, + "rewards/correct_code_reward_func": 0.2708333432674408, + "rewards/len_reward_func": 0.5, "step": 193 }, { - "completion_length": 94.41666984558105, - "epoch": 3.128, - "grad_norm": 1.0959233148257266, - "kl": 0.066162109375, + "completion_length": 25.041667938232422, + "epoch": 3.08, + "grad_norm": 15.903056378391673, + "kl": 0.3701171875, "learning_rate": 3.4995533601559225e-07, - "loss": 0.0001, - "reward": 0.55581995844841, - "reward_std": 0.2805578410625458, - "rewards/correct_code_reward_func": 0.18750000558793545, - "rewards/len_reward_func": 0.36831995844841003, + "loss": 0.0004, + "reward": 1.1250000596046448, + "reward_std": 0.07715167850255966, + "rewards/correct_code_reward_func": 0.6250000149011612, + "rewards/len_reward_func": 0.5, "step": 194 }, { - "completion_length": 101.77083969116211, - "epoch": 3.144, - "grad_norm": 0.46379398279748846, - "kl": 0.03558349609375, + "completion_length": 32.10416793823242, + "epoch": 3.096, + "grad_norm": 0.8594817829407627, + "kl": 1.7802734375, "learning_rate": 3.484689390623218e-07, - "loss": 0.0, - "reward": 0.5298469811677933, - "reward_std": 0.354349821805954, - "rewards/correct_code_reward_func": 0.10416666977107525, - "rewards/len_reward_func": 0.42568032443523407, + "loss": 0.0018, + "reward": 0.6875000298023224, + "reward_std": 0.1767766959965229, + "rewards/correct_code_reward_func": 0.1875000111758709, + "rewards/len_reward_func": 0.5, "step": 195 }, { - "completion_length": 73.64583587646484, - "epoch": 3.16, - "grad_norm": 1.837985634972012, - "kl": 0.13818359375, + "completion_length": 16.645833492279053, + "epoch": 3.112, + "grad_norm": 2.729844658975061, + "kl": 0.443359375, "learning_rate": 3.469784105495816e-07, - "loss": 0.0001, - "reward": 0.5660044550895691, - "reward_std": 0.34897130727767944, - "rewards/correct_code_reward_func": 0.2083333432674408, - "rewards/len_reward_func": 0.3576711118221283, + "loss": 0.0004, + "reward": 1.2708333730697632, + "reward_std": 0.1767766959965229, + "rewards/correct_code_reward_func": 0.7708333432674408, + "rewards/len_reward_func": 0.5, "step": 196 }, { - "completion_length": 67.41666793823242, - "epoch": 3.176, - "grad_norm": 0.7917900411664024, - "kl": 0.114990234375, + "completion_length": 42.00000286102295, + "epoch": 3.128, + "grad_norm": 2.9296646774191917, + "kl": 0.53515625, "learning_rate": 3.4548381301696295e-07, - "loss": 0.0001, - "reward": 0.4153170883655548, - "reward_std": 0.26159487664699554, - "rewards/correct_code_reward_func": 0.02083333395421505, - "rewards/len_reward_func": 0.3944837599992752, + "loss": 0.0005, + "reward": 0.916666716337204, + "reward_std": 0.3794546127319336, + "rewards/correct_code_reward_func": 0.4166666865348816, + "rewards/len_reward_func": 0.5, "step": 197 }, { - "completion_length": 85.29166793823242, - "epoch": 3.192, - "grad_norm": 1.0325385655762465, - "kl": 0.0819091796875, + "completion_length": 45.5, + "epoch": 3.144, + "grad_norm": 1.7226032460824214, + "kl": 0.4296875, "learning_rate": 3.4398520917478476e-07, - "loss": 0.0001, - "reward": 0.5381573736667633, - "reward_std": 0.0840181726962328, - "rewards/correct_code_reward_func": 0.0625, - "rewards/len_reward_func": 0.4756573736667633, + "loss": 0.0004, + "reward": 0.8958333730697632, + "reward_std": 0.28126100450754166, + "rewards/correct_code_reward_func": 0.395833358168602, + "rewards/len_reward_func": 0.5, "step": 198 }, { - "completion_length": 68.29166793823242, - "epoch": 3.208, - "grad_norm": 0.7914760708098421, - "kl": 0.16455078125, + "completion_length": 33.79166793823242, + "epoch": 3.16, + "grad_norm": 1.0268275604382027, + "kl": 0.3486328125, "learning_rate": 3.42482661901463e-07, - "loss": 0.0002, - "reward": 0.6617700159549713, - "reward_std": 0.2904653549194336, - "rewards/correct_code_reward_func": 0.229166679084301, - "rewards/len_reward_func": 0.4326033294200897, + "loss": 0.0003, + "reward": 0.7291666865348816, + "reward_std": 0.22516431659460068, + "rewards/correct_code_reward_func": 0.2500000111758709, + "rewards/len_reward_func": 0.4791666716337204, "step": 199 }, { - "completion_length": 85.45833587646484, - "epoch": 3.224, - "grad_norm": 1.2976333398825715, - "kl": 0.0872802734375, + "completion_length": 35.43750190734863, + "epoch": 3.176, + "grad_norm": 1.0861549087201527, + "kl": 0.287109375, "learning_rate": 3.409762342408719e-07, - "loss": 0.0001, - "reward": 0.539979562163353, - "reward_std": 0.22026720643043518, - "rewards/correct_code_reward_func": 0.0833333358168602, - "rewards/len_reward_func": 0.45664621889591217, + "loss": 0.0003, + "reward": 0.7500000298023224, + "reward_std": 0.19500280916690826, + "rewards/correct_code_reward_func": 0.2500000111758709, + "rewards/len_reward_func": 0.5, "step": 200 }, { - "completion_length": 61.437503814697266, - "epoch": 3.24, - "grad_norm": 0.7518792416547331, - "kl": 0.090087890625, + "completion_length": 55.479166984558105, + "epoch": 3.192, + "grad_norm": 1.5283228572290333, + "kl": 0.375, "learning_rate": 3.3946598939969893e-07, - "loss": 0.0001, - "reward": 0.7156426012516022, - "reward_std": 0.29580071568489075, - "rewards/correct_code_reward_func": 0.2500000111758709, - "rewards/len_reward_func": 0.4656426012516022, + "loss": 0.0004, + "reward": 0.8333333730697632, + "reward_std": 0.15430335700511932, + "rewards/correct_code_reward_func": 0.3333333432674408, + "rewards/len_reward_func": 0.5, "step": 201 }, { - "completion_length": 45.72916793823242, - "epoch": 3.2560000000000002, - "grad_norm": 0.779956862868322, - "kl": 0.0845947265625, + "completion_length": 23.9375, + "epoch": 3.208, + "grad_norm": 1.2041133220324134, + "kl": 0.4013671875, "learning_rate": 3.379519907447931e-07, - "loss": 0.0001, - "reward": 0.5846085548400879, - "reward_std": 0.30107998102903366, - "rewards/correct_code_reward_func": 0.1666666716337204, - "rewards/len_reward_func": 0.4179418534040451, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.24339044094085693, + "rewards/correct_code_reward_func": 0.5, + "rewards/len_reward_func": 0.5, "step": 202 }, { - "completion_length": 70.35417175292969, - "epoch": 3.2720000000000002, - "grad_norm": 0.9057354477673044, - "kl": 0.125244140625, + "completion_length": 41.70833396911621, + "epoch": 3.224, + "grad_norm": 1.0540529518722053, + "kl": 0.2734375, "learning_rate": 3.364343018005057e-07, - "loss": 0.0001, - "reward": 0.6319787204265594, - "reward_std": 0.3149000033736229, - "rewards/correct_code_reward_func": 0.229166679084301, - "rewards/len_reward_func": 0.40281203389167786, + "loss": 0.0003, + "reward": 0.7291666865348816, + "reward_std": 0.08625819534063339, + "rewards/correct_code_reward_func": 0.2291666716337204, + "rewards/len_reward_func": 0.5, "step": 203 }, { - "completion_length": 41.16666793823242, - "epoch": 3.288, - "grad_norm": 0.718338279396836, - "kl": 0.11279296875, + "completion_length": 22.187500953674316, + "epoch": 3.24, + "grad_norm": 7.138753075856611, + "kl": 0.3388671875, "learning_rate": 3.349129862460251e-07, - "loss": 0.0001, - "reward": 0.5834802687168121, - "reward_std": 0.273087527602911, - "rewards/correct_code_reward_func": 0.125, - "rewards/len_reward_func": 0.45848025381565094, + "loss": 0.0003, + "reward": 0.9375, + "reward_std": 0.25249695032835007, + "rewards/correct_code_reward_func": 0.4375, + "rewards/len_reward_func": 0.5, "step": 204 }, { - "completion_length": 80.83333587646484, - "epoch": 3.304, - "grad_norm": 1.229612918435609, - "kl": 0.106201171875, + "completion_length": 22.562500953674316, + "epoch": 3.2560000000000002, + "grad_norm": 2.174154407115016, + "kl": 0.3720703125, "learning_rate": 3.3338810791270517e-07, - "loss": 0.0001, - "reward": 0.605635017156601, - "reward_std": 0.32427075505256653, - "rewards/correct_code_reward_func": 0.1666666716337204, - "rewards/len_reward_func": 0.43896833062171936, + "loss": 0.0004, + "reward": 0.8958333432674408, + "reward_std": 0.3205290399491787, + "rewards/correct_code_reward_func": 0.3958333432674408, + "rewards/len_reward_func": 0.5, "step": 205 }, { - "completion_length": 64.41666793823242, - "epoch": 3.32, - "grad_norm": 0.7802466892306293, - "kl": 0.093994140625, + "completion_length": 26.770834922790527, + "epoch": 3.2720000000000002, + "grad_norm": 1.5444039155167695, + "kl": 0.490234375, "learning_rate": 3.318597307813866e-07, - "loss": 0.0001, - "reward": 0.4557005316019058, - "reward_std": 0.16802169382572174, - "rewards/correct_code_reward_func": 0.0416666679084301, - "rewards/len_reward_func": 0.414033904671669, + "loss": 0.0005, + "reward": 0.7916666865348816, + "reward_std": 0.2342708334326744, + "rewards/correct_code_reward_func": 0.2916666716337204, + "rewards/len_reward_func": 0.5, "step": 206 }, { - "completion_length": 59.875, - "epoch": 3.336, - "grad_norm": 0.9877985890101864, - "kl": 0.1240234375, + "completion_length": 28.000000953674316, + "epoch": 3.288, + "grad_norm": 4.365687287010453, + "kl": 0.349609375, "learning_rate": 3.3032791897971307e-07, - "loss": 0.0001, - "reward": 0.6161052584648132, - "reward_std": 0.2275523617863655, - "rewards/correct_code_reward_func": 0.1875000074505806, - "rewards/len_reward_func": 0.42860524356365204, + "loss": 0.0003, + "reward": 0.7916666865348816, + "reward_std": 0.2630349025130272, + "rewards/correct_code_reward_func": 0.2916666716337204, + "rewards/len_reward_func": 0.5, "step": 207 }, { - "completion_length": 45.25000190734863, - "epoch": 3.352, - "grad_norm": 1.8998950395698522, - "kl": 0.1328125, + "completion_length": 37.14583396911621, + "epoch": 3.304, + "grad_norm": 3.197536110678343, + "kl": 0.283203125, "learning_rate": 3.287927367794397e-07, - "loss": 0.0001, - "reward": 0.5865370333194733, - "reward_std": 0.3143910765647888, - "rewards/correct_code_reward_func": 0.1250000037252903, - "rewards/len_reward_func": 0.46153703331947327, + "loss": 0.0003, + "reward": 0.7973356544971466, + "reward_std": 0.21966809779405594, + "rewards/correct_code_reward_func": 0.3125000149011612, + "rewards/len_reward_func": 0.484835609793663, "step": 208 }, { - "completion_length": 54.291669845581055, - "epoch": 3.368, - "grad_norm": 0.7324049278355411, - "kl": 0.073974609375, + "completion_length": 32.35416793823242, + "epoch": 3.32, + "grad_norm": 3.3805334494300956, + "kl": 0.619140625, "learning_rate": 3.272542485937368e-07, - "loss": 0.0001, - "reward": 0.5458050072193146, - "reward_std": 0.21722080186009407, - "rewards/correct_code_reward_func": 0.1250000037252903, - "rewards/len_reward_func": 0.420804962515831, + "loss": 0.0006, + "reward": 0.7500000298023224, + "reward_std": 0.22233543917536736, + "rewards/correct_code_reward_func": 0.25000000558793545, + "rewards/len_reward_func": 0.5, "step": 209 }, { - "completion_length": 68.56250190734863, - "epoch": 3.384, - "grad_norm": 0.990518981109537, - "kl": 0.113525390625, + "completion_length": 22.4375, + "epoch": 3.336, + "grad_norm": 1.6447820848367736, + "kl": 1.8876953125, "learning_rate": 3.2571251897448763e-07, - "loss": 0.0001, - "reward": 0.5235690921545029, - "reward_std": 0.3244580924510956, - "rewards/correct_code_reward_func": 0.1458333358168602, - "rewards/len_reward_func": 0.3777357488870621, + "loss": 0.0019, + "reward": 1.0416666865348816, + "reward_std": 0.2994871288537979, + "rewards/correct_code_reward_func": 0.5416666865348816, + "rewards/len_reward_func": 0.5, "step": 210 }, { - "completion_length": 69.85416984558105, - "epoch": 3.4, - "grad_norm": 1.2284792953713894, - "kl": 0.13671875, + "completion_length": 27.5625, + "epoch": 3.352, + "grad_norm": 3.389054471629676, + "kl": 0.3701171875, "learning_rate": 3.241676126095792e-07, - "loss": 0.0001, - "reward": 0.6687899529933929, - "reward_std": 0.42763154208660126, - "rewards/correct_code_reward_func": 0.291666679084301, - "rewards/len_reward_func": 0.37712329626083374, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.42927365005016327, + "rewards/correct_code_reward_func": 0.5, + "rewards/len_reward_func": 0.5, "step": 211 }, { - "completion_length": 59.41666793823242, - "epoch": 3.416, - "grad_norm": 1.1415237482400722, - "kl": 0.061279296875, + "completion_length": 24.916666984558105, + "epoch": 3.368, + "grad_norm": 12.368626256137578, + "kl": 0.3408203125, "learning_rate": 3.226195943201883e-07, - "loss": 0.0001, - "reward": 0.6102548837661743, - "reward_std": 0.32314081490039825, - "rewards/correct_code_reward_func": 0.1875, - "rewards/len_reward_func": 0.4227548688650131, + "loss": 0.0003, + "reward": 0.8541666865348816, + "reward_std": 0.204109326004982, + "rewards/correct_code_reward_func": 0.3541666865348816, + "rewards/len_reward_func": 0.5, "step": 212 }, { - "completion_length": 48.72916793823242, - "epoch": 3.432, - "grad_norm": 0.8540792705237201, - "kl": 0.083740234375, + "completion_length": 25.58333396911621, + "epoch": 3.384, + "grad_norm": 1.1298161238486502, + "kl": 0.416015625, "learning_rate": 3.2106852905806216e-07, - "loss": 0.0001, - "reward": 0.6738589107990265, - "reward_std": 0.3664027154445648, - "rewards/correct_code_reward_func": 0.25, - "rewards/len_reward_func": 0.4238588958978653, + "loss": 0.0004, + "reward": 0.9375000596046448, + "reward_std": 0.08625819534063339, + "rewards/correct_code_reward_func": 0.4375000149011612, + "rewards/len_reward_func": 0.5, "step": 213 }, { - "completion_length": 93.72916984558105, - "epoch": 3.448, - "grad_norm": 0.9851440838948831, - "kl": 0.078125, + "completion_length": 30.104167938232422, + "epoch": 3.4, + "grad_norm": 10.260020475970132, + "kl": 0.408203125, "learning_rate": 3.1951448190279253e-07, - "loss": 0.0001, - "reward": 0.5327434539794922, - "reward_std": 0.400749608874321, - "rewards/correct_code_reward_func": 0.1458333358168602, - "rewards/len_reward_func": 0.3869100958108902, + "loss": 0.0004, + "reward": 0.9583333432674408, + "reward_std": 0.1178511306643486, + "rewards/correct_code_reward_func": 0.4583333432674408, + "rewards/len_reward_func": 0.5, "step": 214 }, { - "completion_length": 74.06250381469727, - "epoch": 3.464, - "grad_norm": 1.87568123955445, - "kl": 0.1295166015625, + "completion_length": 29.791666984558105, + "epoch": 3.416, + "grad_norm": 3.3670910458604015, + "kl": 0.37841796875, "learning_rate": 3.179575180590857e-07, - "loss": 0.0001, - "reward": 0.40193726122379303, - "reward_std": 0.24312910437583923, - "rewards/correct_code_reward_func": 0.0, - "rewards/len_reward_func": 0.40193726122379303, + "loss": 0.0004, + "reward": 0.7916666865348816, + "reward_std": 0.2840898931026459, + "rewards/correct_code_reward_func": 0.2916666716337204, + "rewards/len_reward_func": 0.5, "step": 215 }, { - "completion_length": 60.66666793823242, - "epoch": 3.48, - "grad_norm": 0.9226124021677337, - "kl": 0.083984375, + "completion_length": 22.916667938232422, + "epoch": 3.432, + "grad_norm": 8.332848830151702, + "kl": 0.3994140625, "learning_rate": 3.163977028540263e-07, - "loss": 0.0001, - "reward": 0.47024165093898773, - "reward_std": 0.24201779812574387, - "rewards/correct_code_reward_func": 0.08333333395421505, - "rewards/len_reward_func": 0.3869083374738693, + "loss": 0.0004, + "reward": 1.041666716337204, + "reward_std": 0.2630349025130272, + "rewards/correct_code_reward_func": 0.5416666865348816, + "rewards/len_reward_func": 0.5, "step": 216 }, { - "completion_length": 89.60416984558105, - "epoch": 3.496, - "grad_norm": 0.7326306108854702, - "kl": 0.095703125, + "completion_length": 28.854167938232422, + "epoch": 3.448, + "grad_norm": 1.29269474567732, + "kl": 0.41015625, "learning_rate": 3.1483510173433627e-07, - "loss": 0.0001, - "reward": 0.6562305092811584, - "reward_std": 0.45491674542427063, - "rewards/correct_code_reward_func": 0.2708333358168602, - "rewards/len_reward_func": 0.38539716601371765, + "loss": 0.0004, + "reward": 0.9166666865348816, + "reward_std": 0.1451837606728077, + "rewards/correct_code_reward_func": 0.4166666716337204, + "rewards/len_reward_func": 0.5, "step": 217 }, { - "completion_length": 93.91666793823242, - "epoch": 3.512, - "grad_norm": 1.4962557483369903, - "kl": 0.169921875, + "completion_length": 55.583335876464844, + "epoch": 3.464, + "grad_norm": 1.1956536549620946, + "kl": 1.72021484375, "learning_rate": 3.1326978026362905e-07, - "loss": 0.0002, - "reward": 0.5683007538318634, - "reward_std": 0.3712882995605469, - "rewards/correct_code_reward_func": 0.1666666679084301, - "rewards/len_reward_func": 0.4016340672969818, + "loss": 0.0017, + "reward": 0.5416666865348816, + "reward_std": 0.19500282034277916, + "rewards/correct_code_reward_func": 0.06250000186264515, + "rewards/len_reward_func": 0.4791666716337204, "step": 218 }, { - "completion_length": 47.70833396911621, - "epoch": 3.528, - "grad_norm": 0.5846408335917113, - "kl": 0.155517578125, + "completion_length": 32.66666793823242, + "epoch": 3.48, + "grad_norm": 7.474594998118115, + "kl": 1.87890625, "learning_rate": 3.1170180411965854e-07, - "loss": 0.0002, - "reward": 0.5376704931259155, - "reward_std": 0.35435017943382263, - "rewards/correct_code_reward_func": 0.12500000558793545, - "rewards/len_reward_func": 0.4126705080270767, + "loss": 0.0019, + "reward": 0.6458333432674408, + "reward_std": 0.13607725501060486, + "rewards/correct_code_reward_func": 0.1458333432674408, + "rewards/len_reward_func": 0.5, "step": 219 }, { - "completion_length": 80.33333587646484, - "epoch": 3.544, - "grad_norm": 0.953271532334556, - "kl": 0.15673828125, + "completion_length": 33.93750190734863, + "epoch": 3.496, + "grad_norm": 0.14003358731788035, + "kl": 0.5478515625, "learning_rate": 3.101312390915634e-07, - "loss": 0.0002, - "reward": 0.5383496731519699, - "reward_std": 0.32219837605953217, - "rewards/correct_code_reward_func": 0.1041666679084301, - "rewards/len_reward_func": 0.4341830015182495, + "loss": 0.0005, + "reward": 0.916666716337204, + "reward_std": 0.15430335700511932, + "rewards/correct_code_reward_func": 0.416666679084301, + "rewards/len_reward_func": 0.5, "step": 220 }, { - "completion_length": 82.39583587646484, - "epoch": 3.56, - "grad_norm": 1.06493255865889, - "kl": 0.1982421875, + "completion_length": 46.0625, + "epoch": 3.512, + "grad_norm": 3.45663103404313, + "kl": 0.2705078125, "learning_rate": 3.0855815107710665e-07, - "loss": 0.0002, - "reward": 0.5354643762111664, - "reward_std": 0.3206261843442917, - "rewards/correct_code_reward_func": 0.1041666679084301, - "rewards/len_reward_func": 0.431297704577446, + "loss": 0.0003, + "reward": 0.8541666865348816, + "reward_std": 0.28126102685928345, + "rewards/correct_code_reward_func": 0.3541666865348816, + "rewards/len_reward_func": 0.5, "step": 221 }, { - "completion_length": 52.66666793823242, - "epoch": 3.576, - "grad_norm": 0.8941217236697588, - "kl": 0.165283203125, + "completion_length": 26.375000953674316, + "epoch": 3.528, + "grad_norm": 1.4588393763010847, + "kl": 0.861328125, "learning_rate": 3.069826060799109e-07, - "loss": 0.0002, - "reward": 0.5823579728603363, - "reward_std": 0.49040672183036804, - "rewards/correct_code_reward_func": 0.2083333432674408, - "rewards/len_reward_func": 0.3740246146917343, + "loss": 0.0009, + "reward": 0.7708333432674408, + "reward_std": 0.28126101940870285, + "rewards/correct_code_reward_func": 0.2708333432674408, + "rewards/len_reward_func": 0.5, "step": 222 }, { - "completion_length": 46.31250190734863, - "epoch": 3.592, - "grad_norm": 1.4630933649290316, - "kl": 0.1513671875, + "completion_length": 26.479167938232422, + "epoch": 3.544, + "grad_norm": 3.8929614164173505, + "kl": 0.798828125, "learning_rate": 3.054046702066886e-07, - "loss": 0.0002, - "reward": 0.5546734929084778, - "reward_std": 0.19852039963006973, - "rewards/correct_code_reward_func": 0.1041666679084301, - "rewards/len_reward_func": 0.4505068510770798, + "loss": 0.0008, + "reward": 0.8750000596046448, + "reward_std": 0.19500280916690826, + "rewards/correct_code_reward_func": 0.3750000111758709, + "rewards/len_reward_func": 0.5, "step": 223 }, { - "completion_length": 60.458335876464844, - "epoch": 3.608, - "grad_norm": 1.3998397725336895, - "kl": 0.14404296875, + "completion_length": 37.75000190734863, + "epoch": 3.56, + "grad_norm": 1.3984016713523089, + "kl": 0.29345703125, "learning_rate": 3.038244096644687e-07, - "loss": 0.0001, - "reward": 0.6969195306301117, - "reward_std": 0.3739180713891983, - "rewards/correct_code_reward_func": 0.25, - "rewards/len_reward_func": 0.4469195753335953, + "loss": 0.0003, + "reward": 0.7500000298023224, + "reward_std": 0.15430335700511932, + "rewards/correct_code_reward_func": 0.2500000111758709, + "rewards/len_reward_func": 0.5, "step": 224 }, { - "completion_length": 56.083335876464844, - "epoch": 3.624, - "grad_norm": 3.4831731856592327, - "kl": 0.1337890625, + "completion_length": 23.541666984558105, + "epoch": 3.576, + "grad_norm": 1.132022810723016, + "kl": 0.4326171875, "learning_rate": 3.022418907578188e-07, - "loss": 0.0001, - "reward": 0.6825621128082275, - "reward_std": 0.4716331660747528, - "rewards/correct_code_reward_func": 0.291666679084301, - "rewards/len_reward_func": 0.39089545607566833, + "loss": 0.0004, + "reward": 0.8958333432674408, + "reward_std": 0.264432355761528, + "rewards/correct_code_reward_func": 0.3958333432674408, + "rewards/len_reward_func": 0.5, "step": 225 }, { - "completion_length": 46.72916793823242, - "epoch": 3.64, - "grad_norm": 1.144810736242952, - "kl": 0.09521484375, + "completion_length": 27.666666984558105, + "epoch": 3.592, + "grad_norm": 0.9291842503741052, + "kl": 0.4638671875, "learning_rate": 3.0065717988606256e-07, - "loss": 0.0001, - "reward": 0.5482343584299088, - "reward_std": 0.29667874425649643, - "rewards/correct_code_reward_func": 0.125, - "rewards/len_reward_func": 0.42323435842990875, + "loss": 0.0005, + "reward": 0.7500000298023224, + "reward_std": 0.1451837606728077, + "rewards/correct_code_reward_func": 0.2500000149011612, + "rewards/len_reward_func": 0.5, "step": 226 }, { - "completion_length": 59.29166793823242, - "epoch": 3.656, - "grad_norm": 0.9083666113017377, - "kl": 0.1201171875, + "completion_length": 37.0625, + "epoch": 3.608, + "grad_norm": 2.994489684005205, + "kl": 0.58984375, "learning_rate": 2.990703435404944e-07, - "loss": 0.0001, - "reward": 0.551034688949585, - "reward_std": 0.24114000797271729, - "rewards/correct_code_reward_func": 0.125, - "rewards/len_reward_func": 0.42603468894958496, + "loss": 0.0006, + "reward": 1.1458333730697632, + "reward_std": 0.0589255653321743, + "rewards/correct_code_reward_func": 0.6458333432674408, + "rewards/len_reward_func": 0.5, "step": 227 }, { - "completion_length": 56.64583396911621, - "epoch": 3.672, - "grad_norm": 2.4363015768861653, - "kl": 0.1376953125, + "completion_length": 17.479166984558105, + "epoch": 3.624, + "grad_norm": 25.48314385427609, + "kl": 5.3251953125, "learning_rate": 2.974814483015892e-07, - "loss": 0.0001, - "reward": 0.6207046508789062, - "reward_std": 0.3144160062074661, - "rewards/correct_code_reward_func": 0.1875, - "rewards/len_reward_func": 0.43320460617542267, + "loss": 0.0053, + "reward": 1.2291666865348816, + "reward_std": 0.3857453167438507, + "rewards/correct_code_reward_func": 0.7291666865348816, + "rewards/len_reward_func": 0.5, "step": 228 }, { - "completion_length": 99.4375, - "epoch": 3.6879999999999997, - "grad_norm": 1.8003763928920427, - "kl": 0.25341796875, + "completion_length": 25.64583396911621, + "epoch": 3.64, + "grad_norm": 5.331589909011615, + "kl": 0.4599609375, "learning_rate": 2.95890560836209e-07, - "loss": 0.0003, - "reward": 0.7397480607032776, - "reward_std": 0.4180787652730942, - "rewards/correct_code_reward_func": 0.2916666716337204, - "rewards/len_reward_func": 0.4480813890695572, + "loss": 0.0005, + "reward": 0.8333333730697632, + "reward_std": 0.22233545035123825, + "rewards/correct_code_reward_func": 0.3333333544433117, + "rewards/len_reward_func": 0.5, "step": 229 }, { - "completion_length": 53.1875, - "epoch": 3.7039999999999997, - "grad_norm": 1.6649838206036085, - "kl": 0.27783203125, + "completion_length": 39.27083396911621, + "epoch": 3.656, + "grad_norm": 0.9614447916816511, + "kl": 0.29052734375, "learning_rate": 2.942977478948057e-07, "loss": 0.0003, - "reward": 0.5099760442972183, - "reward_std": 0.08963469415903091, - "rewards/correct_code_reward_func": 0.0416666679084301, - "rewards/len_reward_func": 0.46830935776233673, + "reward": 1.0208333730697632, + "reward_std": 0.22516431659460068, + "rewards/correct_code_reward_func": 0.5416666865348816, + "rewards/len_reward_func": 0.4791666716337204, "step": 230 }, { - "completion_length": 69.68750381469727, - "epoch": 3.7199999999999998, - "grad_norm": 2.2866034284580103, - "kl": 0.265625, + "completion_length": 37.62500190734863, + "epoch": 3.672, + "grad_norm": 1.6189285355243652, + "kl": 82.1376953125, "learning_rate": 2.9270307630862006e-07, - "loss": 0.0003, - "reward": 0.4825073331594467, - "reward_std": 0.24035291373729706, - "rewards/correct_code_reward_func": 0.0416666679084301, - "rewards/len_reward_func": 0.4408406764268875, + "loss": 0.0816, + "reward": 0.875, + "reward_std": 0.4159068316221237, + "rewards/correct_code_reward_func": 0.375, + "rewards/len_reward_func": 0.5, "step": 231 }, { - "completion_length": 35.854166984558105, - "epoch": 3.7359999999999998, - "grad_norm": 1.889290275688488, - "kl": 0.27587890625, + "completion_length": 37.64583396911621, + "epoch": 3.6879999999999997, + "grad_norm": 144.3039978026888, + "kl": 0.255859375, "learning_rate": 2.911066129868782e-07, "loss": 0.0003, - "reward": 0.6243129521608353, - "reward_std": 0.4304501414299011, - "rewards/correct_code_reward_func": 0.2083333395421505, - "rewards/len_reward_func": 0.41597960889339447, + "reward": 1.0625, + "reward_std": 0.1480126492679119, + "rewards/correct_code_reward_func": 0.5625, + "rewards/len_reward_func": 0.5, "step": 232 }, { - "completion_length": 59.62500190734863, - "epoch": 3.752, - "grad_norm": 2.126125346865293, - "kl": 0.095947265625, + "completion_length": 28.916667938232422, + "epoch": 3.7039999999999997, + "grad_norm": 3.7202345670411265, + "kl": 0.3232421875, "learning_rate": 2.8950842491398355e-07, - "loss": 0.0001, - "reward": 0.6098466515541077, - "reward_std": 0.21379615366458893, + "loss": 0.0003, + "reward": 0.6250000298023224, + "reward_std": 0.22233543917536736, "rewards/correct_code_reward_func": 0.12500000558793545, - "rewards/len_reward_func": 0.48484663665294647, + "rewards/len_reward_func": 0.5, "step": 233 }, { - "completion_length": 70.27083587646484, - "epoch": 3.768, - "grad_norm": 1.072869194296466, - "kl": 0.1259765625, + "completion_length": 46.83333492279053, + "epoch": 3.7199999999999998, + "grad_norm": 1.2743344086109105, + "kl": 0.2841796875, "learning_rate": 2.87908579146707e-07, - "loss": 0.0001, - "reward": 0.6814843565225601, - "reward_std": 0.35157327353954315, - "rewards/correct_code_reward_func": 0.25000000558793545, - "rewards/len_reward_func": 0.4314843565225601, + "loss": 0.0003, + "reward": 0.7083333730697632, + "reward_std": 0.19500280916690826, + "rewards/correct_code_reward_func": 0.2083333395421505, + "rewards/len_reward_func": 0.5, "step": 234 }, { - "completion_length": 68.85416793823242, - "epoch": 3.784, - "grad_norm": 0.8470810337567637, - "kl": 0.18798828125, + "completion_length": 27.25, + "epoch": 3.7359999999999998, + "grad_norm": 10.188313773698406, + "kl": 8.3095703125, "learning_rate": 2.863071428113726e-07, - "loss": 0.0002, - "reward": 0.4267548620700836, - "reward_std": 0.1786961741745472, - "rewards/correct_code_reward_func": 0.0, - "rewards/len_reward_func": 0.4267548620700836, + "loss": 0.0083, + "reward": 1.0833333432674408, + "reward_std": 0.08908708393573761, + "rewards/correct_code_reward_func": 0.5833333358168602, + "rewards/len_reward_func": 0.5, "step": 235 }, { - "completion_length": 56.500003814697266, - "epoch": 3.8, - "grad_norm": 0.4268990754916049, - "kl": 0.2025146484375, + "completion_length": 32.62500190734863, + "epoch": 3.752, + "grad_norm": 4.109218918644723, + "kl": 1.931640625, "learning_rate": 2.847041831010417e-07, - "loss": 0.0002, - "reward": 0.4583333432674408, - "reward_std": 0.1178511306643486, - "rewards/correct_code_reward_func": 0.0, - "rewards/len_reward_func": 0.4583333432674408, + "loss": 0.0019, + "reward": 0.8233599662780762, + "reward_std": 0.18251240625977516, + "rewards/correct_code_reward_func": 0.3333333432674408, + "rewards/len_reward_func": 0.490026593208313, "step": 236 }, { - "completion_length": 51.8125, - "epoch": 3.816, - "grad_norm": 1.8148712071173405, - "kl": 0.1826171875, + "completion_length": 31.083334922790527, + "epoch": 3.768, + "grad_norm": 1.8606521647165692, + "kl": 0.73828125, "learning_rate": 2.830997672726933e-07, - "loss": 0.0002, - "reward": 0.6326505243778229, - "reward_std": 0.23830580711364746, - "rewards/correct_code_reward_func": 0.1875000074505806, - "rewards/len_reward_func": 0.4451505243778229, + "loss": 0.0007, + "reward": 1.1041666865348816, + "reward_std": 0.30859362706542015, + "rewards/correct_code_reward_func": 0.6041666865348816, + "rewards/len_reward_func": 0.5, "step": 237 }, { - "completion_length": 37.4375, - "epoch": 3.832, - "grad_norm": 1.6952658159559488, - "kl": 0.2041015625, + "completion_length": 55.354169845581055, + "epoch": 3.784, + "grad_norm": 2.734615110690433, + "kl": 0.24951171875, "learning_rate": 2.8149396264440227e-07, - "loss": 0.0002, - "reward": 0.5818339586257935, - "reward_std": 0.3086412772536278, - "rewards/correct_code_reward_func": 0.1458333395421505, - "rewards/len_reward_func": 0.43600064516067505, + "loss": 0.0003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/correct_code_reward_func": 0.0, + "rewards/len_reward_func": 0.5, "step": 238 }, { - "completion_length": 42.93750190734863, - "epoch": 3.848, - "grad_norm": 1.1881168537898452, - "kl": 0.1728515625, + "completion_length": 41.291666984558105, + "epoch": 3.8, + "grad_norm": 0.012449679074169549, + "kl": 0.2978515625, "learning_rate": 2.798868365925147e-07, - "loss": 0.0002, - "reward": 0.4946992099285126, - "reward_std": 0.1176161989569664, - "rewards/correct_code_reward_func": 0.02083333395421505, - "rewards/len_reward_func": 0.4738658666610718, + "loss": 0.0003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/correct_code_reward_func": 0.0, + "rewards/len_reward_func": 0.5, "step": 239 }, { - "completion_length": 48.604169845581055, - "epoch": 3.864, - "grad_norm": 1.3112500423790903, - "kl": 0.1962890625, + "completion_length": 36.77083492279053, + "epoch": 3.816, + "grad_norm": 7.797923174364327, + "kl": 0.3642578125, "learning_rate": 2.782784565488211e-07, - "loss": 0.0002, - "reward": 0.7240410149097443, - "reward_std": 0.4527997225522995, - "rewards/correct_code_reward_func": 0.2916666865348816, - "rewards/len_reward_func": 0.43237435817718506, + "loss": 0.0004, + "reward": 0.7975983917713165, + "reward_std": 0.10107371583580971, + "rewards/correct_code_reward_func": 0.3125000149011612, + "rewards/len_reward_func": 0.48509839177131653, "step": 240 }, { - "completion_length": 37.66666793823242, - "epoch": 3.88, - "grad_norm": 5.549976044339671, - "kl": 0.27685546875, + "completion_length": 28.375000953674316, + "epoch": 3.832, + "grad_norm": 2.477116298025544, + "kl": 0.24755859375, "learning_rate": 2.7666888999772656e-07, - "loss": 0.0003, - "reward": 0.7916666865348816, - "reward_std": 0.39485183358192444, - "rewards/correct_code_reward_func": 0.2916666716337204, + "loss": 0.0002, + "reward": 0.9375000596046448, + "reward_std": 0.2587745860219002, + "rewards/correct_code_reward_func": 0.4375000149011612, "rewards/len_reward_func": 0.5, "step": 241 }, { - "completion_length": 82.60416984558105, - "epoch": 3.896, - "grad_norm": 1.8188793787787139, - "kl": 0.265625, + "completion_length": 30.416666984558105, + "epoch": 3.848, + "grad_norm": 0.9811120055537635, + "kl": 9.095703125, "learning_rate": 2.7505820447342024e-07, - "loss": 0.0003, - "reward": 0.6013766527175903, - "reward_std": 0.3011641651391983, + "loss": 0.0091, + "reward": 0.6666666865348816, + "reward_std": 0.0, "rewards/correct_code_reward_func": 0.1666666716337204, - "rewards/len_reward_func": 0.43470996618270874, + "rewards/len_reward_func": 0.5, "step": 242 }, { - "completion_length": 60.145835876464844, - "epoch": 3.912, - "grad_norm": 1.898648967822129, - "kl": 0.2353515625, + "completion_length": 19.39583396911621, + "epoch": 3.864, + "grad_norm": 15.396866385585833, + "kl": 1.05078125, "learning_rate": 2.7344646755704073e-07, - "loss": 0.0002, - "reward": 0.5906525254249573, - "reward_std": 0.46274301409721375, - "rewards/correct_code_reward_func": 0.1666666716337204, - "rewards/len_reward_func": 0.4239858388900757, + "loss": 0.001, + "reward": 1.1875000596046448, + "reward_std": 0.3219604417681694, + "rewards/correct_code_reward_func": 0.6875000298023224, + "rewards/len_reward_func": 0.5, "step": 243 }, { - "completion_length": 61.6875, - "epoch": 3.928, - "grad_norm": 1.328276177248427, - "kl": 0.22216796875, + "completion_length": 23.041667461395264, + "epoch": 3.88, + "grad_norm": 12.447948337219644, + "kl": 0.421875, "learning_rate": 2.7183374687384096e-07, - "loss": 0.0002, - "reward": 0.6718549132347107, - "reward_std": 0.26345258578658104, - "rewards/correct_code_reward_func": 0.2291666679084301, - "rewards/len_reward_func": 0.4426882416009903, + "loss": 0.0004, + "reward": 1.0000000596046448, + "reward_std": 0.19500280916690826, + "rewards/correct_code_reward_func": 0.5000000111758709, + "rewards/len_reward_func": 0.5, "step": 244 }, { - "completion_length": 42.0625, - "epoch": 3.944, - "grad_norm": 1.3774347106449945, - "kl": 0.19189453125, + "completion_length": 43.3125, + "epoch": 3.896, + "grad_norm": 1.581341578871288, + "kl": 0.2802734375, "learning_rate": 2.7022011009035107e-07, - "loss": 0.0002, - "reward": 0.7021766602993011, - "reward_std": 0.42118969559669495, - "rewards/correct_code_reward_func": 0.2500000111758709, - "rewards/len_reward_func": 0.45217666029930115, + "loss": 0.0003, + "reward": 0.8125000298023224, + "reward_std": 0.0589255653321743, + "rewards/correct_code_reward_func": 0.3125000149011612, + "rewards/len_reward_func": 0.5, "step": 245 }, { - "completion_length": 54.395835876464844, - "epoch": 3.96, - "grad_norm": 2.7965232874695656, - "kl": 0.1689453125, + "completion_length": 34.208335876464844, + "epoch": 3.912, + "grad_norm": 0.9152208180228742, + "kl": 2.37890625, "learning_rate": 2.686056249115385e-07, - "loss": 0.0002, - "reward": 0.6701188087463379, - "reward_std": 0.34179161489009857, - "rewards/correct_code_reward_func": 0.2291666716337204, - "rewards/len_reward_func": 0.4409521520137787, + "loss": 0.0024, + "reward": 0.9791666865348816, + "reward_std": 0.4748324006795883, + "rewards/correct_code_reward_func": 0.4791666865348816, + "rewards/len_reward_func": 0.5, "step": 246 }, { - "completion_length": 47.95833396911621, - "epoch": 3.976, - "grad_norm": 1.1532787051667712, - "kl": 0.26318359375, + "completion_length": 35.83333492279053, + "epoch": 3.928, + "grad_norm": 1.5451820654194635, + "kl": 0.513671875, "learning_rate": 2.669903590779679e-07, - "loss": 0.0003, - "reward": 0.8034215867519379, - "reward_std": 0.33907921612262726, - "rewards/correct_code_reward_func": 0.354166679084301, - "rewards/len_reward_func": 0.44925491511821747, + "loss": 0.0005, + "reward": 0.7083333730697632, + "reward_std": 0.19500280916690826, + "rewards/correct_code_reward_func": 0.2291666679084301, + "rewards/len_reward_func": 0.4791666716337204, "step": 247 }, { - "completion_length": 55.29166793823242, - "epoch": 3.992, - "grad_norm": 1.2590383713445363, - "kl": 0.19091796875, + "completion_length": 23.625000953674316, + "epoch": 3.944, + "grad_norm": 3.4351450173628177, + "kl": 0.419921875, "learning_rate": 2.653743803629587e-07, - "loss": 0.0002, - "reward": 0.7690662443637848, - "reward_std": 0.26584791392087936, - "rewards/correct_code_reward_func": 0.3125000149011612, - "rewards/len_reward_func": 0.4565662145614624, + "loss": 0.0004, + "reward": 1.1250000298023224, + "reward_std": 0.19500282034277916, + "rewards/correct_code_reward_func": 0.6250000298023224, + "rewards/len_reward_func": 0.5, "step": 248 }, { - "completion_length": 57.125, - "epoch": 4.0, - "grad_norm": 1.1116164246956617, - "kl": 0.11669921875, + "completion_length": 22.916666984558105, + "epoch": 3.96, + "grad_norm": 4.124748952346768, + "kl": 0.484375, "learning_rate": 2.637577565697412e-07, - "loss": 0.0001, - "reward": 0.6158446073532104, - "reward_std": 0.25658977031707764, - "rewards/correct_code_reward_func": 0.1666666716337204, - "rewards/len_reward_func": 0.44917792081832886, + "loss": 0.0005, + "reward": 0.9583333432674408, + "reward_std": 0.2994871288537979, + "rewards/correct_code_reward_func": 0.4583333432674408, + "rewards/len_reward_func": 0.5, "step": 249 }, { - "completion_length": 53.45833396911621, - "epoch": 4.016, - "grad_norm": 0.8013179043832869, - "kl": 0.3369140625, + "completion_length": 26.437500953674316, + "epoch": 3.976, + "grad_norm": 7.576926136134958, + "kl": 0.8662109375, "learning_rate": 2.621405555286121e-07, - "loss": 0.0003, - "reward": 0.5910404771566391, - "reward_std": 0.2622908353805542, - "rewards/correct_code_reward_func": 0.1458333432674408, - "rewards/len_reward_func": 0.4452071338891983, + "loss": 0.0009, + "reward": 1.0833333730697632, + "reward_std": 0.1451837606728077, + "rewards/correct_code_reward_func": 0.583333358168602, + "rewards/len_reward_func": 0.5, "step": 250 }, { - "completion_length": 48.50000190734863, - "epoch": 4.032, - "grad_norm": 1.670175887558562, - "kl": 0.14404296875, + "completion_length": 24.89583396911621, + "epoch": 3.992, + "grad_norm": 19.398267903289618, + "kl": 0.6259765625, "learning_rate": 2.60522845094088e-07, - "loss": 0.0001, - "reward": 0.7500000298023224, - "reward_std": 0.31142251193523407, - "rewards/correct_code_reward_func": 0.2708333432674408, - "rewards/len_reward_func": 0.4791666716337204, + "loss": 0.0006, + "reward": 1.1458333730697632, + "reward_std": 0.1767766959965229, + "rewards/correct_code_reward_func": 0.6458333432674408, + "rewards/len_reward_func": 0.5, "step": 251 }, { - "completion_length": 72.25000381469727, - "epoch": 4.048, - "grad_norm": 1.0094904596835066, - "kl": 0.19287109375, + "completion_length": 28.625, + "epoch": 4.0, + "grad_norm": 4.9899585795469665, + "kl": 0.79296875, "learning_rate": 2.589046931420589e-07, - "loss": 0.0002, - "reward": 0.567308098077774, - "reward_std": 0.3976233899593353, - "rewards/correct_code_reward_func": 0.1666666716337204, - "rewards/len_reward_func": 0.40064144134521484, + "loss": 0.0004, + "reward": 0.8333333730697632, + "reward_std": 0.34503278136253357, + "rewards/correct_code_reward_func": 0.3333333432674408, + "rewards/len_reward_func": 0.5, "step": 252 }, { - "completion_length": 47.416669845581055, - "epoch": 4.064, - "grad_norm": 1.3183921866728252, - "kl": 0.13232421875, + "completion_length": 28.89583396911621, + "epoch": 4.016, + "grad_norm": 3.4544081031205565, + "kl": 0.3349609375, "learning_rate": 2.572861675669399e-07, - "loss": 0.0001, - "reward": 0.7874628305435181, - "reward_std": 0.2574521154165268, - "rewards/correct_code_reward_func": 0.2916666716337204, - "rewards/len_reward_func": 0.4957961291074753, + "loss": 0.0003, + "reward": 0.7500000298023224, + "reward_std": 0.24966806918382645, + "rewards/correct_code_reward_func": 0.2500000111758709, + "rewards/len_reward_func": 0.5, "step": 253 }, { - "completion_length": 53.583335876464844, - "epoch": 4.08, - "grad_norm": 0.7061144910500725, - "kl": 0.359375, + "completion_length": 27.354167938232422, + "epoch": 4.032, + "grad_norm": 5.360212409444759, + "kl": 8.87109375, "learning_rate": 2.556673362788225e-07, - "loss": 0.0004, - "reward": 0.5823437571525574, - "reward_std": 0.25689636170864105, - "rewards/correct_code_reward_func": 0.1250000037252903, - "rewards/len_reward_func": 0.4573437422513962, + "loss": 0.0089, + "reward": 0.8750000298023224, + "reward_std": 0.2721545025706291, + "rewards/correct_code_reward_func": 0.3750000149011612, + "rewards/len_reward_func": 0.5, "step": 254 }, { - "completion_length": 41.31250190734863, - "epoch": 4.096, - "grad_norm": 2.791651126736062, - "kl": 0.174560546875, + "completion_length": 56.958335876464844, + "epoch": 4.048, + "grad_norm": 4.337364120588155, + "kl": 0.26171875, "learning_rate": 2.540482672006254e-07, - "loss": 0.0002, - "reward": 0.8784991502761841, - "reward_std": 0.5018954128026962, - "rewards/correct_code_reward_func": 0.4583333432674408, - "rewards/len_reward_func": 0.4201658070087433, + "loss": 0.0003, + "reward": 0.8750000298023224, + "reward_std": 0.2553258389234543, + "rewards/correct_code_reward_func": 0.3958333432674408, + "rewards/len_reward_func": 0.4791666716337204, "step": 255 }, { - "completion_length": 40.41666793823242, - "epoch": 4.112, - "grad_norm": 1.2214952182858416, - "kl": 0.37158203125, + "completion_length": 38.14583492279053, + "epoch": 4.064, + "grad_norm": 5.937461009037335, + "kl": 0.2958984375, "learning_rate": 2.524290282652443e-07, - "loss": 0.0004, - "reward": 0.6296601295471191, - "reward_std": 0.3699600249528885, - "rewards/correct_code_reward_func": 0.1875, - "rewards/len_reward_func": 0.44216009974479675, + "loss": 0.0003, + "reward": 0.9583333432674408, + "reward_std": 0.2357022576034069, + "rewards/correct_code_reward_func": 0.4791666716337204, + "rewards/len_reward_func": 0.4791666716337204, "step": 256 }, { - "completion_length": 37.04166793823242, - "epoch": 4.128, - "grad_norm": 0.7476384667208398, - "kl": 0.28466796875, + "completion_length": 40.70833396911621, + "epoch": 4.08, + "grad_norm": 6.960976871582724, + "kl": 0.28515625, "learning_rate": 2.508096874127022e-07, "loss": 0.0003, - "reward": 0.7083333730697632, - "reward_std": 0.2428889200091362, - "rewards/correct_code_reward_func": 0.2291666679084301, - "rewards/len_reward_func": 0.4791666716337204, + "reward": 1.0416666865348816, + "reward_std": 0.3177001625299454, + "rewards/correct_code_reward_func": 0.5416666865348816, + "rewards/len_reward_func": 0.5, "step": 257 }, { - "completion_length": 47.43750190734863, - "epoch": 4.144, - "grad_norm": 1.126554313392494, - "kl": 0.143798828125, + "completion_length": 22.58333396911621, + "epoch": 4.096, + "grad_norm": 3.64669977435422, + "kl": 0.2900390625, "learning_rate": 2.4919031258729785e-07, - "loss": 0.0001, - "reward": 0.9141716659069061, - "reward_std": 0.5029458105564117, - "rewards/correct_code_reward_func": 0.4375000149011612, - "rewards/len_reward_func": 0.47667166590690613, + "loss": 0.0003, + "reward": 1.1250000596046448, + "reward_std": 0.2314550280570984, + "rewards/correct_code_reward_func": 0.6250000298023224, + "rewards/len_reward_func": 0.5, "step": 258 }, { - "completion_length": 39.25, - "epoch": 4.16, - "grad_norm": 2.4029046394268923, - "kl": 0.2353515625, + "completion_length": 34.83333492279053, + "epoch": 4.112, + "grad_norm": 644.251002590668, + "kl": 290.12451171875, "learning_rate": 2.475709717347557e-07, - "loss": 0.0002, - "reward": 1.0325255393981934, - "reward_std": 0.3972947895526886, - "rewards/correct_code_reward_func": 0.5416666865348816, - "rewards/len_reward_func": 0.4908588379621506, + "loss": 0.2906, + "reward": 0.7708333432674408, + "reward_std": 0.2041093371808529, + "rewards/correct_code_reward_func": 0.2708333432674408, + "rewards/len_reward_func": 0.5, "step": 259 }, { - "completion_length": 44.33333492279053, - "epoch": 4.176, - "grad_norm": 2.0184703022532693, - "kl": 0.345703125, + "completion_length": 26.83333396911621, + "epoch": 4.128, + "grad_norm": 11.440118730433715, + "kl": 1.41796875, "learning_rate": 2.459517327993746e-07, - "loss": 0.0003, - "reward": 0.7508440315723419, - "reward_std": 0.36033160239458084, - "rewards/correct_code_reward_func": 0.2708333432674408, - "rewards/len_reward_func": 0.48001065850257874, + "loss": 0.0014, + "reward": 0.9375000298023224, + "reward_std": 0.28126100450754166, + "rewards/correct_code_reward_func": 0.4375000298023224, + "rewards/len_reward_func": 0.5, "step": 260 }, { - "completion_length": 36.833335876464844, - "epoch": 4.192, - "grad_norm": 1.1572076654969774, - "kl": 0.19970703125, + "completion_length": 29.520834922790527, + "epoch": 4.144, + "grad_norm": 179.75006965593022, + "kl": 104.625, "learning_rate": 2.443326637211775e-07, - "loss": 0.0002, - "reward": 0.6571691036224365, - "reward_std": 0.24022644013166428, - "rewards/correct_code_reward_func": 0.1875, - "rewards/len_reward_func": 0.4696691334247589, + "loss": 0.1045, + "reward": 1.1041667461395264, + "reward_std": 0.22516432404518127, + "rewards/correct_code_reward_func": 0.6041666865348816, + "rewards/len_reward_func": 0.5, "step": 261 }, { - "completion_length": 59.687503814697266, - "epoch": 4.208, - "grad_norm": 1.163189022531501, - "kl": 0.4091796875, + "completion_length": 20.33333396911621, + "epoch": 4.16, + "grad_norm": 2.132132107066578, + "kl": 0.3193359375, "learning_rate": 2.427138324330601e-07, - "loss": 0.0004, - "reward": 0.5650826990604401, - "reward_std": 0.2466234788298607, - "rewards/correct_code_reward_func": 0.0833333358168602, - "rewards/len_reward_func": 0.4817493408918381, + "loss": 0.0003, + "reward": 1.4583333730697632, + "reward_std": 0.1178511306643486, + "rewards/correct_code_reward_func": 0.9583333730697632, + "rewards/len_reward_func": 0.5, "step": 262 }, { - "completion_length": 35.16666793823242, - "epoch": 4.224, - "grad_norm": 2.3259946307073567, - "kl": 0.3623046875, + "completion_length": 29.812501430511475, + "epoch": 4.176, + "grad_norm": 157.34801225037154, + "kl": 62.5693359375, "learning_rate": 2.4109530685794106e-07, - "loss": 0.0004, - "reward": 0.7291666865348816, - "reward_std": 0.21322892233729362, - "rewards/correct_code_reward_func": 0.229166679084301, - "rewards/len_reward_func": 0.5, + "loss": 0.0626, + "reward": 1.0833333730697632, + "reward_std": 0.2630349025130272, + "rewards/correct_code_reward_func": 0.6041666865348816, + "rewards/len_reward_func": 0.4791666716337204, "step": 263 }, { - "completion_length": 70.04166793823242, - "epoch": 4.24, - "grad_norm": 1.4718996945337304, - "kl": 0.3134765625, + "completion_length": 25.89583396911621, + "epoch": 4.192, + "grad_norm": 4.26239211159855, + "kl": 0.5234375, "learning_rate": 2.3947715490591203e-07, - "loss": 0.0003, - "reward": 0.5781983733177185, - "reward_std": 0.23685935139656067, - "rewards/correct_code_reward_func": 0.1041666679084301, - "rewards/len_reward_func": 0.4740317016839981, + "loss": 0.0005, + "reward": 0.875, + "reward_std": 0.36751921474933624, + "rewards/correct_code_reward_func": 0.375, + "rewards/len_reward_func": 0.5, "step": 264 }, { - "completion_length": 100.83333587646484, - "epoch": 4.256, - "grad_norm": 2.171828650408951, - "kl": 0.8681640625, + "completion_length": 41.979169845581055, + "epoch": 4.208, + "grad_norm": 1.0123222669357814, + "kl": 0.27734375, "learning_rate": 2.37859444471388e-07, - "loss": 0.0009, - "reward": 0.4322066307067871, - "reward_std": 0.19174856692552567, - "rewards/correct_code_reward_func": 0.0, - "rewards/len_reward_func": 0.4322066307067871, + "loss": 0.0003, + "reward": 0.6458333432674408, + "reward_std": 0.24056155234575272, + "rewards/correct_code_reward_func": 0.1458333358168602, + "rewards/len_reward_func": 0.5, "step": 265 }, { - "completion_length": 46.3125, - "epoch": 4.272, - "grad_norm": 1.1392116034939896, - "kl": 0.2216796875, + "completion_length": 27.854166984558105, + "epoch": 4.224, + "grad_norm": 117.90954129112552, + "kl": 75.134765625, "learning_rate": 2.3624224343025876e-07, - "loss": 0.0002, - "reward": 0.6220839321613312, - "reward_std": 0.2598780319094658, - "rewards/correct_code_reward_func": 0.1250000037252903, - "rewards/len_reward_func": 0.4970839321613312, + "loss": 0.0751, + "reward": 0.7291666865348816, + "reward_std": 0.16340987384319305, + "rewards/correct_code_reward_func": 0.2291666716337204, + "rewards/len_reward_func": 0.5, "step": 266 }, { - "completion_length": 73.91666793823242, - "epoch": 4.288, - "grad_norm": 1.106073215720508, - "kl": 0.2255859375, + "completion_length": 39.5625, + "epoch": 4.24, + "grad_norm": 5.1933183547730986, + "kl": 1.02783203125, "learning_rate": 2.346256196370413e-07, - "loss": 0.0002, - "reward": 0.6419270932674408, - "reward_std": 0.2151578813791275, - "rewards/correct_code_reward_func": 0.16666667722165585, - "rewards/len_reward_func": 0.4752604216337204, + "loss": 0.001, + "reward": 0.6837384402751923, + "reward_std": 0.34314342588186264, + "rewards/correct_code_reward_func": 0.2083333432674408, + "rewards/len_reward_func": 0.47540509700775146, "step": 267 }, { - "completion_length": 43.33333396911621, - "epoch": 4.304, - "grad_norm": 0.9488106305498838, - "kl": 0.1796875, + "completion_length": 92.64583587646484, + "epoch": 4.256, + "grad_norm": 0.9859859289357618, + "kl": 0.79638671875, "learning_rate": 2.3300964092203203e-07, - "loss": 0.0002, - "reward": 0.8300662040710449, - "reward_std": 0.42881861329078674, - "rewards/correct_code_reward_func": 0.375, - "rewards/len_reward_func": 0.4550662636756897, + "loss": 0.0008, + "reward": 0.5416666865348816, + "reward_std": 0.07715167850255966, + "rewards/correct_code_reward_func": 0.0416666679084301, + "rewards/len_reward_func": 0.5, "step": 268 }, { - "completion_length": 98.20833587646484, - "epoch": 4.32, - "grad_norm": 0.5329470108796975, - "kl": 0.1376953125, + "completion_length": 34.89583396911621, + "epoch": 4.272, + "grad_norm": 16.913050019636458, + "kl": 16.21875, "learning_rate": 2.3139437508846152e-07, - "loss": 0.0001, - "reward": 0.6460709869861603, - "reward_std": 0.25160279124975204, - "rewards/correct_code_reward_func": 0.16666667722165585, - "rewards/len_reward_func": 0.4794043153524399, + "loss": 0.0162, + "reward": 0.5208333432674408, + "reward_std": 0.0589255653321743, + "rewards/correct_code_reward_func": 0.02083333395421505, + "rewards/len_reward_func": 0.5, "step": 269 }, { - "completion_length": 74.68750190734863, - "epoch": 4.336, - "grad_norm": 0.82995457588531, - "kl": 0.173095703125, + "completion_length": 58.29166793823242, + "epoch": 4.288, + "grad_norm": 1.2524162935664223, + "kl": 0.302734375, "learning_rate": 2.2977988990964896e-07, - "loss": 0.0002, - "reward": 0.5542029142379761, - "reward_std": 0.2829088717699051, - "rewards/correct_code_reward_func": 0.1041666679084301, - "rewards/len_reward_func": 0.4500362426042557, + "loss": 0.0003, + "reward": 0.7291666865348816, + "reward_std": 0.08625819534063339, + "rewards/correct_code_reward_func": 0.2291666716337204, + "rewards/len_reward_func": 0.5, "step": 270 }, { - "completion_length": 44.35416793823242, - "epoch": 4.352, - "grad_norm": 1.4530961035974823, - "kl": 0.164794921875, + "completion_length": 33.12500190734863, + "epoch": 4.304, + "grad_norm": 4.007391098481951, + "kl": 0.32080078125, "learning_rate": 2.28166253126159e-07, - "loss": 0.0002, - "reward": 0.7500000298023224, - "reward_std": 0.33723290264606476, - "rewards/correct_code_reward_func": 0.2916666716337204, - "rewards/len_reward_func": 0.4583333432674408, + "loss": 0.0003, + "reward": 1.0833333730697632, + "reward_std": 0.4173382371664047, + "rewards/correct_code_reward_func": 0.5833333432674408, + "rewards/len_reward_func": 0.5, "step": 271 }, { - "completion_length": 35.89583492279053, - "epoch": 4.368, - "grad_norm": 2.5744805182579, - "kl": 0.38916015625, + "completion_length": 54.770835876464844, + "epoch": 4.32, + "grad_norm": 1.765789657496171, + "kl": 0.4287109375, "learning_rate": 2.2655353244295927e-07, "loss": 0.0004, - "reward": 0.625, - "reward_std": 0.1451837718486786, - "rewards/correct_code_reward_func": 0.125, + "reward": 0.7500000298023224, + "reward_std": 0.08908708393573761, + "rewards/correct_code_reward_func": 0.2500000074505806, "rewards/len_reward_func": 0.5, "step": 272 }, { - "completion_length": 47.625, - "epoch": 4.384, - "grad_norm": 0.7364778839875519, - "kl": 0.728515625, + "completion_length": 46.833335876464844, + "epoch": 4.336, + "grad_norm": 3.0926129932371356, + "kl": 0.35546875, "learning_rate": 2.2494179552657974e-07, - "loss": 0.0007, - "reward": 0.7859163284301758, - "reward_std": 0.22225653380155563, - "rewards/correct_code_reward_func": 0.2916666679084301, - "rewards/len_reward_func": 0.494249626994133, + "loss": 0.0004, + "reward": 0.9128472208976746, + "reward_std": 0.2177412360906601, + "rewards/correct_code_reward_func": 0.4166666716337204, + "rewards/len_reward_func": 0.49618056416511536, "step": 273 }, { - "completion_length": 86.39583587646484, - "epoch": 4.4, - "grad_norm": 1.3878386860528988, - "kl": 0.25146484375, + "completion_length": 29.479166984558105, + "epoch": 4.352, + "grad_norm": 9.296614387501771, + "kl": 0.21484375, "learning_rate": 2.233311100022734e-07, - "loss": 0.0003, - "reward": 0.7891799807548523, - "reward_std": 0.29507260024547577, - "rewards/correct_code_reward_func": 0.29166667722165585, - "rewards/len_reward_func": 0.4975132644176483, + "loss": 0.0002, + "reward": 0.875, + "reward_std": 0.31142252683639526, + "rewards/correct_code_reward_func": 0.375, + "rewards/len_reward_func": 0.5, "step": 274 }, { - "completion_length": 57.020835876464844, - "epoch": 4.416, - "grad_norm": 0.7271938817428542, - "kl": 0.3740234375, + "completion_length": 24.104166984558105, + "epoch": 4.368, + "grad_norm": 4.386312904473957, + "kl": 0.396484375, "learning_rate": 2.2172154345117894e-07, "loss": 0.0004, - "reward": 0.6616936028003693, - "reward_std": 0.22100430727005005, - "rewards/correct_code_reward_func": 0.1875000111758709, - "rewards/len_reward_func": 0.4741935580968857, + "reward": 0.7291666865348816, + "reward_std": 0.22516433894634247, + "rewards/correct_code_reward_func": 0.25, + "rewards/len_reward_func": 0.4791666716337204, "step": 275 }, { - "completion_length": 56.770835876464844, - "epoch": 4.432, - "grad_norm": 0.9442340494359821, - "kl": 0.37890625, + "completion_length": 31.229167938232422, + "epoch": 4.384, + "grad_norm": 9.14988110970696, + "kl": 1.267578125, "learning_rate": 2.2011316340748528e-07, - "loss": 0.0004, - "reward": 0.684626430273056, - "reward_std": 0.4674495756626129, - "rewards/correct_code_reward_func": 0.2708333432674408, - "rewards/len_reward_func": 0.4137931168079376, + "loss": 0.0013, + "reward": 0.8958333730697632, + "reward_std": 0.33108004927635193, + "rewards/correct_code_reward_func": 0.3958333432674408, + "rewards/len_reward_func": 0.5, "step": 276 }, { - "completion_length": 51.58333396911621, - "epoch": 4.448, - "grad_norm": 0.8906274955583354, - "kl": 0.3564453125, + "completion_length": 47.041666984558105, + "epoch": 4.4, + "grad_norm": 15.08131618344892, + "kl": 4.38720703125, "learning_rate": 2.1850603735559776e-07, - "loss": 0.0004, - "reward": 0.5396991074085236, - "reward_std": 0.26860009878873825, - "rewards/correct_code_reward_func": 0.08333333395421505, - "rewards/len_reward_func": 0.45636576414108276, + "loss": 0.0044, + "reward": 0.9583333432674408, + "reward_std": 0.2903675250709057, + "rewards/correct_code_reward_func": 0.4583333432674408, + "rewards/len_reward_func": 0.5, "step": 277 }, { - "completion_length": 36.29166793823242, - "epoch": 4.464, - "grad_norm": 2.551400627149655, - "kl": 0.93115234375, + "completion_length": 49.541666984558105, + "epoch": 4.416, + "grad_norm": 4.095080167229849, + "kl": 0.8115234375, "learning_rate": 2.1690023272730678e-07, - "loss": 0.0009, - "reward": 0.8541666865348816, - "reward_std": 0.39140307903289795, - "rewards/correct_code_reward_func": 0.3541666716337204, - "rewards/len_reward_func": 0.5, + "loss": 0.0008, + "reward": 0.625, + "reward_std": 0.1178511306643486, + "rewards/correct_code_reward_func": 0.1458333432674408, + "rewards/len_reward_func": 0.4791666716337204, "step": 278 }, { - "completion_length": 54.72916793823242, - "epoch": 4.48, - "grad_norm": 1.006521153889889, - "kl": 0.21044921875, + "completion_length": 25.291666984558105, + "epoch": 4.432, + "grad_norm": 7.954449742072288, + "kl": 0.2734375, "learning_rate": 2.1529581689895836e-07, - "loss": 0.0002, - "reward": 0.5577778220176697, - "reward_std": 0.2672848328948021, - "rewards/correct_code_reward_func": 0.0833333358168602, - "rewards/len_reward_func": 0.4744444489479065, + "loss": 0.0003, + "reward": 0.9166666865348816, + "reward_std": 0.36751919239759445, + "rewards/correct_code_reward_func": 0.4166666716337204, + "rewards/len_reward_func": 0.5, "step": 279 }, { - "completion_length": 42.395835876464844, - "epoch": 4.496, - "grad_norm": 0.7084841751136701, - "kl": 0.4697265625, + "completion_length": 31.604166984558105, + "epoch": 4.448, + "grad_norm": 29.8321255682335, + "kl": 21.2265625, "learning_rate": 2.1369285718862748e-07, - "loss": 0.0005, + "loss": 0.0212, "reward": 0.6666666865348816, - "reward_std": 0.2428889200091362, - "rewards/correct_code_reward_func": 0.1875000111758709, - "rewards/len_reward_func": 0.4791666716337204, + "reward_std": 0.17251639068126678, + "rewards/correct_code_reward_func": 0.1666666716337204, + "rewards/len_reward_func": 0.5, "step": 280 }, { - "completion_length": 54.00000190734863, - "epoch": 4.5120000000000005, - "grad_norm": 0.9172719642504781, - "kl": 1.244140625, + "completion_length": 19.08333396911621, + "epoch": 4.464, + "grad_norm": 5.637716121004335, + "kl": 1.09765625, "learning_rate": 2.1209142085329298e-07, - "loss": 0.0012, - "reward": 0.8541666865348816, - "reward_std": 0.23144195601344109, - "rewards/correct_code_reward_func": 0.3541666865348816, + "loss": 0.0011, + "reward": 1.1666666865348816, + "reward_std": 0.1178511306643486, + "rewards/correct_code_reward_func": 0.6666666865348816, "rewards/len_reward_func": 0.5, "step": 281 }, { - "completion_length": 37.645835876464844, - "epoch": 4.5280000000000005, - "grad_norm": 0.8649839388995086, - "kl": 0.181640625, + "completion_length": 34.37500190734863, + "epoch": 4.48, + "grad_norm": 4861.585798607741, + "kl": 3440.1103515625, "learning_rate": 2.104915750860164e-07, - "loss": 0.0002, - "reward": 0.6843220591545105, - "reward_std": 0.1857653521001339, - "rewards/correct_code_reward_func": 0.2083333395421505, - "rewards/len_reward_func": 0.4759887158870697, + "loss": 3.4448, + "reward": 0.871611475944519, + "reward_std": 0.3660082519054413, + "rewards/correct_code_reward_func": 0.3750000149011612, + "rewards/len_reward_func": 0.49661144614219666, "step": 282 }, { - "completion_length": 66.45833587646484, - "epoch": 4.5440000000000005, - "grad_norm": 0.6522878069874, - "kl": 2.056640625, + "completion_length": 27.1875, + "epoch": 4.496, + "grad_norm": 2.5633170180059732, + "kl": 0.3515625, "learning_rate": 2.088933870131218e-07, - "loss": 0.0021, - "reward": 0.6666666865348816, - "reward_std": 0.2630348764359951, - "rewards/correct_code_reward_func": 0.20833333395421505, - "rewards/len_reward_func": 0.4583333432674408, + "loss": 0.0004, + "reward": 0.75, + "reward_std": 0.19500282034277916, + "rewards/correct_code_reward_func": 0.25, + "rewards/len_reward_func": 0.5, "step": 283 }, { - "completion_length": 37.62500190734863, - "epoch": 4.5600000000000005, - "grad_norm": 0.8233886562066076, - "kl": 0.59375, + "completion_length": 51.25000190734863, + "epoch": 4.5120000000000005, + "grad_norm": 2.416767668919635, + "kl": 0.4931640625, "learning_rate": 2.072969236913799e-07, - "loss": 0.0006, - "reward": 0.9888699054718018, - "reward_std": 0.24612322449684143, - "rewards/correct_code_reward_func": 0.5000000149011612, - "rewards/len_reward_func": 0.48886987566947937, + "loss": 0.0005, + "reward": 0.8901910185813904, + "reward_std": 0.1639716625213623, + "rewards/correct_code_reward_func": 0.3958333432674408, + "rewards/len_reward_func": 0.4943576455116272, "step": 284 }, { - "completion_length": 95.64583587646484, - "epoch": 4.576, - "grad_norm": 0.6103496667756032, - "kl": 0.2861328125, + "completion_length": 19.625, + "epoch": 4.5280000000000005, + "grad_norm": 2.4304450279898893, + "kl": 0.373046875, "learning_rate": 2.0570225210519433e-07, - "loss": 0.0003, - "reward": 0.5596122443675995, - "reward_std": 0.22038332745432854, - "rewards/correct_code_reward_func": 0.08333333395421505, - "rewards/len_reward_func": 0.4762788861989975, + "loss": 0.0004, + "reward": 0.791666716337204, + "reward_std": 0.22233543917536736, + "rewards/correct_code_reward_func": 0.29166667722165585, + "rewards/len_reward_func": 0.5, "step": 285 }, { - "completion_length": 72.70833396911621, - "epoch": 4.592, - "grad_norm": 1.891083292736719, - "kl": 0.23193359375, + "completion_length": 30.08333396911621, + "epoch": 4.5440000000000005, + "grad_norm": 6.167179936081702, + "kl": 0.3251953125, "learning_rate": 2.0410943916379097e-07, - "loss": 0.0002, - "reward": 0.5823873281478882, - "reward_std": 0.22501128166913986, - "rewards/correct_code_reward_func": 0.1041666679084301, - "rewards/len_reward_func": 0.4782206267118454, + "loss": 0.0003, + "reward": 0.9166666865348816, + "reward_std": 0.2903675436973572, + "rewards/correct_code_reward_func": 0.4166666865348816, + "rewards/len_reward_func": 0.5, "step": 286 }, { - "completion_length": 39.83333396911621, - "epoch": 4.608, - "grad_norm": 2.005511526767329, - "kl": 0.29638671875, + "completion_length": 18.354167461395264, + "epoch": 4.5600000000000005, + "grad_norm": 4.914270308521702, + "kl": 1.1083984375, "learning_rate": 2.0251855169841075e-07, - "loss": 0.0003, - "reward": 0.7587064802646637, - "reward_std": 0.35144534707069397, - "rewards/correct_code_reward_func": 0.2708333358168602, - "rewards/len_reward_func": 0.4878731369972229, + "loss": 0.0011, + "reward": 1.0833333730697632, + "reward_std": 0.22233545035123825, + "rewards/correct_code_reward_func": 0.5833333432674408, + "rewards/len_reward_func": 0.5, "step": 287 }, { - "completion_length": 58.16666793823242, - "epoch": 4.624, - "grad_norm": 0.7595124585668036, - "kl": 0.333984375, + "completion_length": 73.60416984558105, + "epoch": 4.576, + "grad_norm": 3.3424488078879273, + "kl": 0.28955078125, "learning_rate": 2.0092965645950564e-07, "loss": 0.0003, - "reward": 0.8635696470737457, - "reward_std": 0.22203121334314346, - "rewards/correct_code_reward_func": 0.395833358168602, - "rewards/len_reward_func": 0.46773628890514374, + "reward": 0.6875, + "reward_std": 0.13607725501060486, + "rewards/correct_code_reward_func": 0.1875, + "rewards/len_reward_func": 0.5, "step": 288 }, { - "completion_length": 36.02083396911621, - "epoch": 4.64, - "grad_norm": 1.6351611542716487, - "kl": 0.3857421875, + "completion_length": 39.22916793823242, + "epoch": 4.592, + "grad_norm": 6.32426908664414, + "kl": 0.28271484375, "learning_rate": 1.993428201139375e-07, - "loss": 0.0004, - "reward": 0.8541666865348816, - "reward_std": 0.3584126830101013, - "rewards/correct_code_reward_func": 0.3750000149011612, - "rewards/len_reward_func": 0.4791666716337204, + "loss": 0.0003, + "reward": 0.6666666865348816, + "reward_std": 0.2342708557844162, + "rewards/correct_code_reward_func": 0.1666666716337204, + "rewards/len_reward_func": 0.5, "step": 289 }, { - "completion_length": 55.458335876464844, - "epoch": 4.656, - "grad_norm": 0.8926930458457247, - "kl": 0.35546875, + "completion_length": 22.479166984558105, + "epoch": 4.608, + "grad_norm": 38.222771661469466, + "kl": 16.890625, "learning_rate": 1.977581092421812e-07, - "loss": 0.0004, - "reward": 0.8727189898490906, - "reward_std": 0.41859960556030273, + "loss": 0.0169, + "reward": 0.8958333730697632, + "reward_std": 0.08625819534063339, "rewards/correct_code_reward_func": 0.3958333432674408, - "rewards/len_reward_func": 0.4768856465816498, + "rewards/len_reward_func": 0.5, "step": 290 }, { - "completion_length": 49.91666793823242, - "epoch": 4.672, - "grad_norm": 0.6340748007509769, - "kl": 0.16943359375, + "completion_length": 34.979166984558105, + "epoch": 4.624, + "grad_norm": 8.242066863632235, + "kl": 0.9091796875, "learning_rate": 1.9617559033553126e-07, - "loss": 0.0002, - "reward": 0.6335565447807312, - "reward_std": 0.2479529306292534, - "rewards/correct_code_reward_func": 0.1666666679084301, - "rewards/len_reward_func": 0.466889888048172, + "loss": 0.0009, + "reward": 0.8333333730697632, + "reward_std": 0.2342708334326744, + "rewards/correct_code_reward_func": 0.3333333432674408, + "rewards/len_reward_func": 0.5, "step": 291 }, { - "completion_length": 49.50000190734863, - "epoch": 4.688, - "grad_norm": 1.08655909875487, - "kl": 0.4951171875, + "completion_length": 24.541667938232422, + "epoch": 4.64, + "grad_norm": 1.21691590374382, + "kl": 0.23046875, "learning_rate": 1.9459532979331148e-07, - "loss": 0.0005, - "reward": 0.9375000596046448, - "reward_std": 0.204109326004982, - "rewards/correct_code_reward_func": 0.4375000149011612, + "loss": 0.0002, + "reward": 1.0625000596046448, + "reward_std": 0.13607725501060486, + "rewards/correct_code_reward_func": 0.5625000149011612, "rewards/len_reward_func": 0.5, "step": 292 }, { - "completion_length": 69.08333587646484, - "epoch": 4.704, - "grad_norm": 0.5751923430248818, - "kl": 0.39013671875, + "completion_length": 25.27083396911621, + "epoch": 4.656, + "grad_norm": 8.643768890047063, + "kl": 0.443359375, "learning_rate": 1.930173939200892e-07, "loss": 0.0004, - "reward": 0.6483603715896606, - "reward_std": 0.24255751073360443, - "rewards/correct_code_reward_func": 0.2083333432674408, - "rewards/len_reward_func": 0.44002701342105865, + "reward": 1.0625000596046448, + "reward_std": 0.2931964099407196, + "rewards/correct_code_reward_func": 0.5625000298023224, + "rewards/len_reward_func": 0.5, "step": 293 }, { - "completion_length": 51.791669845581055, - "epoch": 4.72, - "grad_norm": 1.9932316130751548, - "kl": 7.380859375, + "completion_length": 35.64583396911621, + "epoch": 4.672, + "grad_norm": 2.743333904000121, + "kl": 1.708984375, "learning_rate": 1.9144184892289336e-07, - "loss": 0.0074, - "reward": 0.7171160280704498, - "reward_std": 0.26716797798871994, - "rewards/correct_code_reward_func": 0.22916667722165585, - "rewards/len_reward_func": 0.48794935643672943, + "loss": 0.0017, + "reward": 0.8333333730697632, + "reward_std": 0.17817416787147522, + "rewards/correct_code_reward_func": 0.3333333432674408, + "rewards/len_reward_func": 0.5, "step": 294 }, { - "completion_length": 42.16666793823242, - "epoch": 4.736, - "grad_norm": 2.3966727580891924, - "kl": 0.373046875, + "completion_length": 33.812500953674316, + "epoch": 4.688, + "grad_norm": 6.045139670180444, + "kl": 0.34765625, "learning_rate": 1.8986876090843664e-07, - "loss": 0.0004, - "reward": 1.011071503162384, - "reward_std": 0.46453168988227844, - "rewards/correct_code_reward_func": 0.5416666865348816, - "rewards/len_reward_func": 0.46940475702285767, + "loss": 0.0003, + "reward": 1.0625, + "reward_std": 0.1480126492679119, + "rewards/correct_code_reward_func": 0.5833333432674408, + "rewards/len_reward_func": 0.4791666716337204, "step": 295 }, { - "completion_length": 36.60416793823242, - "epoch": 4.752, - "grad_norm": 1.6181556287216, - "kl": 0.3564453125, + "completion_length": 61.95833492279053, + "epoch": 4.704, + "grad_norm": 3.6627974499384623, + "kl": 0.453125, "learning_rate": 1.882981958803414e-07, - "loss": 0.0004, - "reward": 1.1458333730697632, - "reward_std": 0.3857453167438507, - "rewards/correct_code_reward_func": 0.6458333432674408, - "rewards/len_reward_func": 0.5, + "loss": 0.0005, + "reward": 0.7690277099609375, + "reward_std": 0.13556675985455513, + "rewards/correct_code_reward_func": 0.2916666716337204, + "rewards/len_reward_func": 0.4773610234260559, "step": 296 }, { - "completion_length": 71.70833587646484, - "epoch": 4.768, - "grad_norm": 1.1444434754067216, - "kl": 0.37646484375, + "completion_length": 47.00000190734863, + "epoch": 4.72, + "grad_norm": 3.1790839878647574, + "kl": 0.4951171875, "learning_rate": 1.8673021973637093e-07, - "loss": 0.0004, - "reward": 0.7599400877952576, - "reward_std": 0.3331267535686493, - "rewards/correct_code_reward_func": 0.2916666716337204, - "rewards/len_reward_func": 0.46827343106269836, + "loss": 0.0005, + "reward": 0.8541666865348816, + "reward_std": 0.0589255653321743, + "rewards/correct_code_reward_func": 0.3541666865348816, + "rewards/len_reward_func": 0.5, "step": 297 }, { - "completion_length": 63.29166793823242, - "epoch": 4.784, - "grad_norm": 0.9294223033475936, - "kl": 0.369140625, + "completion_length": 34.85416793823242, + "epoch": 4.736, + "grad_norm": 1.8967683625125047, + "kl": 0.4326171875, "learning_rate": 1.8516489826566374e-07, "loss": 0.0004, - "reward": 0.8333333730697632, - "reward_std": 0.33875515311956406, - "rewards/correct_code_reward_func": 0.3333333432674408, + "reward": 1.1041666865348816, + "reward_std": 0.23144195601344109, + "rewards/correct_code_reward_func": 0.6041666865348816, "rewards/len_reward_func": 0.5, "step": 298 }, { - "completion_length": 60.75000190734863, - "epoch": 4.8, - "grad_norm": 0.6201098508306943, - "kl": 0.251953125, + "completion_length": 21.89583396911621, + "epoch": 4.752, + "grad_norm": 11.472906917031708, + "kl": 0.3603515625, "learning_rate": 1.8360229714597368e-07, - "loss": 0.0003, - "reward": 0.5748698115348816, - "reward_std": 0.2189413234591484, - "rewards/correct_code_reward_func": 0.1041666679084301, - "rewards/len_reward_func": 0.4707031399011612, + "loss": 0.0004, + "reward": 1.2916666865348816, + "reward_std": 0.22233543917536736, + "rewards/correct_code_reward_func": 0.7916666865348816, + "rewards/len_reward_func": 0.5, "step": 299 }, { - "completion_length": 53.208335876464844, - "epoch": 4.816, - "grad_norm": 0.7178187934543306, - "kl": 0.365234375, - "learning_rate": 1.8204248194091425e-07, - "loss": 0.0004, - "reward": 0.5908489525318146, - "reward_std": 0.2010781541466713, - "rewards/correct_code_reward_func": 0.125, - "rewards/len_reward_func": 0.4658489376306534, + "completion_length": 47.47916793823242, + "epoch": 4.768, + "grad_norm": 18.376424918444233, + "kl": 4.580078125, + "learning_rate": 1.8204248194091425e-07, + "loss": 0.0045, + "reward": 0.7916666865348816, + "reward_std": 0.07715167850255966, + "rewards/correct_code_reward_func": 0.2916666716337204, + "rewards/len_reward_func": 0.5, "step": 300 }, { - "completion_length": 55.00000190734863, - "epoch": 4.832, - "grad_norm": 0.9187468800336557, - "kl": 0.6103515625, + "completion_length": 56.5, + "epoch": 4.784, + "grad_norm": 7.120288162523622, + "kl": 0.31298828125, "learning_rate": 1.804855180972075e-07, - "loss": 0.0006, - "reward": 0.6250000298023224, - "reward_std": 0.22233545035123825, - "rewards/correct_code_reward_func": 0.1250000037252903, + "loss": 0.0003, + "reward": 0.9375, + "reward_std": 0.23144196718931198, + "rewards/correct_code_reward_func": 0.4375, "rewards/len_reward_func": 0.5, "step": 301 }, { - "completion_length": 64.72916793823242, - "epoch": 4.848, - "grad_norm": 1.290623352128771, - "kl": 0.962890625, + "completion_length": 42.89583396911621, + "epoch": 4.8, + "grad_norm": 4.028419691961419, + "kl": 1.111328125, "learning_rate": 1.7893147094193784e-07, - "loss": 0.001, - "reward": 0.748400866985321, - "reward_std": 0.2990126460790634, - "rewards/correct_code_reward_func": 0.2916666679084301, - "rewards/len_reward_func": 0.45673419535160065, + "loss": 0.0011, + "reward": 0.5833333432674408, + "reward_std": 0.08908708393573761, + "rewards/correct_code_reward_func": 0.0833333358168602, + "rewards/len_reward_func": 0.5, "step": 302 }, { - "completion_length": 46.58333396911621, - "epoch": 4.864, - "grad_norm": 3.169269142776468, - "kl": 0.30810546875, + "completion_length": 39.95833396911621, + "epoch": 4.816, + "grad_norm": 1.911137619069032, + "kl": 0.3046875, "learning_rate": 1.7738040567981165e-07, "loss": 0.0003, - "reward": 0.9166666865348816, - "reward_std": 0.2630349025130272, - "rewards/correct_code_reward_func": 0.4166666865348816, - "rewards/len_reward_func": 0.5, + "reward": 0.8125, + "reward_std": 0.37034808099269867, + "rewards/correct_code_reward_func": 0.3333333358168602, + "rewards/len_reward_func": 0.4791666716337204, "step": 303 }, { - "completion_length": 41.125, - "epoch": 4.88, - "grad_norm": 0.7481447513768535, - "kl": 0.306640625, + "completion_length": 31.854167938232422, + "epoch": 4.832, + "grad_norm": 2.705599015258731, + "kl": 2.70703125, "learning_rate": 1.7583238739042084e-07, - "loss": 0.0003, - "reward": 1.1875000596046448, - "reward_std": 0.35827483236789703, - "rewards/correct_code_reward_func": 0.7083333730697632, - "rewards/len_reward_func": 0.4791666716337204, + "loss": 0.0027, + "reward": 0.6041666865348816, + "reward_std": 0.204109326004982, + "rewards/correct_code_reward_func": 0.1041666679084301, + "rewards/len_reward_func": 0.5, "step": 304 }, { - "completion_length": 105.56250381469727, - "epoch": 4.896, - "grad_norm": 0.8356273750549695, - "kl": 0.16796875, + "completion_length": 66.0416669845581, + "epoch": 4.848, + "grad_norm": 6.944354666378553, + "kl": 1.524169921875, "learning_rate": 1.7428748102551234e-07, - "loss": 0.0002, - "reward": 0.789855107665062, - "reward_std": 0.12297509284690022, - "rewards/correct_code_reward_func": 0.2916666865348816, - "rewards/len_reward_func": 0.49818842113018036, + "loss": 0.0015, + "reward": 0.7218064665794373, + "reward_std": 0.2561583071947098, + "rewards/correct_code_reward_func": 0.2291666679084301, + "rewards/len_reward_func": 0.49263978004455566, "step": 305 }, { - "completion_length": 34.22916793823242, - "epoch": 4.912, - "grad_norm": 0.4106060099057334, - "kl": 0.31640625, + "completion_length": 31.39583396911621, + "epoch": 4.864, + "grad_norm": 0.6849820957521907, + "kl": 0.2724609375, "learning_rate": 1.7274575140626315e-07, "loss": 0.0003, - "reward": 0.8541666865348816, - "reward_std": 0.13607725501060486, - "rewards/correct_code_reward_func": 0.3541666716337204, + "reward": 1.1458333730697632, + "reward_std": 0.0589255653321743, + "rewards/correct_code_reward_func": 0.6458333432674408, "rewards/len_reward_func": 0.5, "step": 306 }, { - "completion_length": 47.16666793823242, - "epoch": 4.928, - "grad_norm": 1.6951390244165374, - "kl": 0.984375, + "completion_length": 19.14583396911621, + "epoch": 4.88, + "grad_norm": 21.688892126211925, + "kl": 35.15625, "learning_rate": 1.712072632205604e-07, - "loss": 0.001, - "reward": 0.944968581199646, - "reward_std": 0.5533296465873718, - "rewards/correct_code_reward_func": 0.5000000298023224, - "rewards/len_reward_func": 0.4449685513973236, + "loss": 0.0351, + "reward": 1.2083333730697632, + "reward_std": 0.34018659591674805, + "rewards/correct_code_reward_func": 0.7083333730697632, + "rewards/len_reward_func": 0.5, "step": 307 }, { - "completion_length": 39.60416793823242, - "epoch": 4.944, - "grad_norm": 0.8160832010996655, - "kl": 0.18310546875, + "completion_length": 50.04166793823242, + "epoch": 4.896, + "grad_norm": 0.829370422590923, + "kl": 0.26708984375, "learning_rate": 1.6967208102028696e-07, - "loss": 0.0002, - "reward": 0.7083333432674408, - "reward_std": 0.2553258389234543, - "rewards/correct_code_reward_func": 0.2083333358168602, - "rewards/len_reward_func": 0.5, + "loss": 0.0003, + "reward": 0.9525146782398224, + "reward_std": 0.17141204327344894, + "rewards/correct_code_reward_func": 0.4583333358168602, + "rewards/len_reward_func": 0.4941813200712204, "step": 308 }, { - "completion_length": 52.29166793823242, - "epoch": 4.96, - "grad_norm": 0.6978281365270744, - "kl": 0.5634765625, + "completion_length": 22.791667938232422, + "epoch": 4.912, + "grad_norm": 2.275578259047212, + "kl": 1.4638671875, "learning_rate": 1.6814026921861335e-07, - "loss": 0.0006, - "reward": 0.854166716337204, - "reward_std": 0.23144196718931198, - "rewards/correct_code_reward_func": 0.3958333432674408, - "rewards/len_reward_func": 0.4583333432674408, + "loss": 0.0015, + "reward": 0.8750000298023224, + "reward_std": 0.1451837606728077, + "rewards/correct_code_reward_func": 0.3750000149011612, + "rewards/len_reward_func": 0.5, "step": 309 }, { - "completion_length": 48.4375, - "epoch": 4.976, - "grad_norm": 1.8561890601134186, - "kl": 0.890625, + "completion_length": 31.604166984558105, + "epoch": 4.928, + "grad_norm": 6.885099376296929, + "kl": 2.74658203125, "learning_rate": 1.6661189208729489e-07, - "loss": 0.0009, - "reward": 0.7916666865348816, - "reward_std": 0.2994871288537979, - "rewards/correct_code_reward_func": 0.2916666716337204, + "loss": 0.0027, + "reward": 1.2083333432674408, + "reward_std": 0.24966806173324585, + "rewards/correct_code_reward_func": 0.7083333432674408, "rewards/len_reward_func": 0.5, "step": 310 }, { - "completion_length": 48.1875, - "epoch": 4.992, - "grad_norm": 0.7979733914386165, - "kl": 0.220703125, + "completion_length": 29.666667938232422, + "epoch": 4.944, + "grad_norm": 6.096335936871119, + "kl": 0.33837890625, "learning_rate": 1.6508701375397486e-07, - "loss": 0.0002, - "reward": 0.7265486121177673, - "reward_std": 0.30873851478099823, - "rewards/correct_code_reward_func": 0.2500000074505806, - "rewards/len_reward_func": 0.47654858231544495, + "loss": 0.0003, + "reward": 0.916666716337204, + "reward_std": 0.45660628378391266, + "rewards/correct_code_reward_func": 0.4166666865348816, + "rewards/len_reward_func": 0.5, "step": 311 }, { - "completion_length": 34.75, - "epoch": 5.0, - "grad_norm": 0.7979733914386165, - "kl": 0.1943359375, + "completion_length": 40.10416793823242, + "epoch": 4.96, + "grad_norm": 12.76661113809146, + "kl": 0.421875, "learning_rate": 1.6356569819949427e-07, - "loss": 0.0001, - "reward": 1.125, - "reward_std": 0.1178511306643486, - "rewards/correct_code_reward_func": 0.625, + "loss": 0.0004, + "reward": 1.0625000596046448, + "reward_std": 0.30231601744890213, + "rewards/correct_code_reward_func": 0.5625000298023224, "rewards/len_reward_func": 0.5, "step": 312 }, { - "completion_length": 43.56250190734863, - "epoch": 5.016, - "grad_norm": 1.1867164558898395, - "kl": 0.32373046875, + "completion_length": 44.64583492279053, + "epoch": 4.976, + "grad_norm": 2.1774262299991034, + "kl": 1.03515625, "learning_rate": 1.6204800925520685e-07, - "loss": 0.0003, - "reward": 0.8333333432674408, - "reward_std": 0.2357022576034069, - "rewards/correct_code_reward_func": 0.33333333395421505, + "loss": 0.001, + "reward": 0.8125, + "reward_std": 0.28126102685928345, + "rewards/correct_code_reward_func": 0.3125, "rewards/len_reward_func": 0.5, "step": 313 }, { - "completion_length": 58.791669845581055, - "epoch": 5.032, - "grad_norm": 0.6930691781141401, - "kl": 0.46875, + "completion_length": 32.479166984558105, + "epoch": 4.992, + "grad_norm": 1.1114158577515365, + "kl": 0.85546875, "learning_rate": 1.6053401060030097e-07, - "loss": 0.0005, - "reward": 0.9166666865348816, - "reward_std": 0.22233545035123825, - "rewards/correct_code_reward_func": 0.4166666716337204, + "loss": 0.0009, + "reward": 0.854166716337204, + "reward_std": 0.175345279276371, + "rewards/correct_code_reward_func": 0.354166679084301, "rewards/len_reward_func": 0.5, "step": 314 }, { - "completion_length": 55.39583396911621, - "epoch": 5.048, - "grad_norm": 2.482740908269578, - "kl": 0.83935546875, + "completion_length": 23.95833396911621, + "epoch": 5.0, + "grad_norm": 1.1114158577515365, + "kl": 0.486328125, "learning_rate": 1.5902376575912814e-07, - "loss": 0.0008, - "reward": 0.7500000298023224, - "reward_std": 0.28408990427851677, - "rewards/correct_code_reward_func": 0.2708333432674408, - "rewards/len_reward_func": 0.4791666716337204, + "loss": 0.0002, + "reward": 1.375, + "reward_std": 0.17251639068126678, + "rewards/correct_code_reward_func": 0.875, + "rewards/len_reward_func": 0.5, "step": 315 }, { - "completion_length": 70.56250381469727, - "epoch": 5.064, - "grad_norm": 0.7517797188478601, - "kl": 0.193359375, + "completion_length": 27.729167938232422, + "epoch": 5.016, + "grad_norm": 3.4863048431802666, + "kl": 0.4541015625, "learning_rate": 1.57517338098537e-07, - "loss": 0.0002, - "reward": 0.6666666865348816, - "reward_std": 0.2994871288537979, - "rewards/correct_code_reward_func": 0.1666666716337204, + "loss": 0.0005, + "reward": 0.9583333730697632, + "reward_std": 0.22233543917536736, + "rewards/correct_code_reward_func": 0.458333358168602, "rewards/len_reward_func": 0.5, "step": 316 }, { - "completion_length": 47.12500190734863, - "epoch": 5.08, - "grad_norm": 0.8728604480780374, - "kl": 0.341796875, + "completion_length": 45.79166793823242, + "epoch": 5.032, + "grad_norm": 2.9450234856850557, + "kl": 0.48193359375, "learning_rate": 1.5601479082521525e-07, - "loss": 0.0003, - "reward": 0.6410502195358276, - "reward_std": 0.2793925926089287, - "rewards/correct_code_reward_func": 0.1458333395421505, - "rewards/len_reward_func": 0.49521684646606445, + "loss": 0.0005, + "reward": 0.8541666865348816, + "reward_std": 0.25392838567495346, + "rewards/correct_code_reward_func": 0.3541666865348816, + "rewards/len_reward_func": 0.5, "step": 317 }, { - "completion_length": 63.562503814697266, - "epoch": 5.096, - "grad_norm": 0.7176660807158549, - "kl": 0.6240234375, + "completion_length": 37.645835876464844, + "epoch": 5.048, + "grad_norm": 2.365992710448499, + "kl": 0.2646484375, "learning_rate": 1.545161869830371e-07, - "loss": 0.0006, - "reward": 0.7274567484855652, - "reward_std": 0.19909487664699554, - "rewards/correct_code_reward_func": 0.25, - "rewards/len_reward_func": 0.477456733584404, + "loss": 0.0003, + "reward": 0.9583333730697632, + "reward_std": 0.2840898931026459, + "rewards/correct_code_reward_func": 0.4583333432674408, + "rewards/len_reward_func": 0.5, "step": 318 }, { - "completion_length": 81.29166793823242, - "epoch": 5.112, - "grad_norm": 2.417010408616767, - "kl": 0.6142578125, + "completion_length": 56.79166793823242, + "epoch": 5.064, + "grad_norm": 1.7294710190023255, + "kl": 0.18359375, "learning_rate": 1.5302158945041837e-07, - "loss": 0.0006, - "reward": 0.8533105254173279, - "reward_std": 0.2956180199980736, - "rewards/correct_code_reward_func": 0.3750000149011612, - "rewards/len_reward_func": 0.4783105105161667, + "loss": 0.0002, + "reward": 0.7899168729782104, + "reward_std": 0.22728458046913147, + "rewards/correct_code_reward_func": 0.2916666679084301, + "rewards/len_reward_func": 0.49825021624565125, "step": 319 }, { - "completion_length": 78.93750381469727, - "epoch": 5.128, - "grad_norm": 1.001894790132369, - "kl": 0.197265625, + "completion_length": 20.89583396911621, + "epoch": 5.08, + "grad_norm": 16.06787787034615, + "kl": 0.431640625, "learning_rate": 1.5153106093767825e-07, - "loss": 0.0002, - "reward": 1.014756977558136, - "reward_std": 0.3837348073720932, - "rewards/correct_code_reward_func": 0.5625000149011612, - "rewards/len_reward_func": 0.4522569477558136, + "loss": 0.0004, + "reward": 0.6458333432674408, + "reward_std": 0.0589255653321743, + "rewards/correct_code_reward_func": 0.1458333432674408, + "rewards/len_reward_func": 0.5, "step": 320 }, { - "completion_length": 38.31250190734863, - "epoch": 5.144, - "grad_norm": 0.7375444997883743, - "kl": 0.576171875, + "completion_length": 37.72916793823242, + "epoch": 5.096, + "grad_norm": 4.822046610306533, + "kl": 0.375, "learning_rate": 1.5004466398440773e-07, - "loss": 0.0006, - "reward": 0.7500000298023224, - "reward_std": 0.22233543917536736, - "rewards/correct_code_reward_func": 0.25000000558793545, - "rewards/len_reward_func": 0.5, + "loss": 0.0004, + "reward": 0.916666716337204, + "reward_std": 0.19500279426574707, + "rewards/correct_code_reward_func": 0.4375000149011612, + "rewards/len_reward_func": 0.4791666716337204, "step": 321 }, { - "completion_length": 79.04166793823242, - "epoch": 5.16, - "grad_norm": 1.081610905786561, - "kl": 1.6796875, + "completion_length": 67.45833587646484, + "epoch": 5.112, + "grad_norm": 4.887246928042618, + "kl": 0.29296875, "learning_rate": 1.4856246095684622e-07, - "loss": 0.0017, - "reward": 0.5750000327825546, - "reward_std": 0.2145303450524807, - "rewards/correct_code_reward_func": 0.1041666716337204, - "rewards/len_reward_func": 0.47083334624767303, + "loss": 0.0003, + "reward": 0.8657760918140411, + "reward_std": 0.26504893600940704, + "rewards/correct_code_reward_func": 0.3958333432674408, + "rewards/len_reward_func": 0.46994274854660034, "step": 322 }, { - "completion_length": 54.79166793823242, - "epoch": 5.176, - "grad_norm": 1.2225974244848414, - "kl": 0.27734375, + "completion_length": 49.12500190734863, + "epoch": 5.128, + "grad_norm": 2.807742783582148, + "kl": 0.2431640625, "learning_rate": 1.4708451404526407e-07, - "loss": 0.0003, - "reward": 0.7585616707801819, - "reward_std": 0.2388189136981964, - "rewards/correct_code_reward_func": 0.2708333432674408, - "rewards/len_reward_func": 0.4877283275127411, + "loss": 0.0002, + "reward": 1.0833333730697632, + "reward_std": 0.19500282034277916, + "rewards/correct_code_reward_func": 0.583333358168602, + "rewards/len_reward_func": 0.5, "step": 323 }, { - "completion_length": 61.437503814697266, - "epoch": 5.192, - "grad_norm": 0.769812175072725, - "kl": 0.29638671875, + "completion_length": 19.916667938232422, + "epoch": 5.144, + "grad_norm": 6.084536093056242, + "kl": 1.8671875, "learning_rate": 1.4561088526135374e-07, - "loss": 0.0003, - "reward": 0.7666933834552765, - "reward_std": 0.3699685037136078, - "rewards/correct_code_reward_func": 0.2916666865348816, - "rewards/len_reward_func": 0.4750267118215561, + "loss": 0.0019, + "reward": 0.75, + "reward_std": 0.1451837718486786, + "rewards/correct_code_reward_func": 0.25, + "rewards/len_reward_func": 0.5, "step": 324 }, { - "completion_length": 45.60416793823242, - "epoch": 5.208, - "grad_norm": 0.7219664099291901, - "kl": 0.3095703125, + "completion_length": 54.52083396911621, + "epoch": 5.16, + "grad_norm": 5.422926804352802, + "kl": 4.15625, "learning_rate": 1.4414163643562753e-07, - "loss": 0.0003, - "reward": 0.875, - "reward_std": 0.2903675436973572, - "rewards/correct_code_reward_func": 0.375, + "loss": 0.0042, + "reward": 0.5833333432674408, + "reward_std": 0.1451837606728077, + "rewards/correct_code_reward_func": 0.08333333395421505, "rewards/len_reward_func": 0.5, "step": 325 }, { - "completion_length": 48.16666793823242, - "epoch": 5.224, - "grad_norm": 0.6178093397003758, - "kl": 0.2646484375, + "completion_length": 33.04166793823242, + "epoch": 5.176, + "grad_norm": 5.605984792393882, + "kl": 0.18505859375, "learning_rate": 1.4267682921482356e-07, - "loss": 0.0003, - "reward": 0.5625000149011612, - "reward_std": 0.2041093334555626, - "rewards/correct_code_reward_func": 0.0833333358168602, - "rewards/len_reward_func": 0.4791666716337204, + "loss": 0.0002, + "reward": 1.0625000596046448, + "reward_std": 0.2658637687563896, + "rewards/correct_code_reward_func": 0.5625000149011612, + "rewards/len_reward_func": 0.5, "step": 326 }, { - "completion_length": 53.83333396911621, - "epoch": 5.24, - "grad_norm": 0.7976835771132705, - "kl": 0.58447265625, + "completion_length": 34.56250190734863, + "epoch": 5.192, + "grad_norm": 11.103407207018604, + "kl": 0.2412109375, "learning_rate": 1.4121652505931918e-07, - "loss": 0.0006, + "loss": 0.0002, "reward": 0.8541666865348816, - "reward_std": 0.25392838567495346, + "reward_std": 0.33592626452445984, "rewards/correct_code_reward_func": 0.3541666716337204, "rewards/len_reward_func": 0.5, "step": 327 }, { - "completion_length": 59.000003814697266, - "epoch": 5.256, - "grad_norm": 0.6465131606447013, - "kl": 0.302734375, + "completion_length": 34.89583492279053, + "epoch": 5.208, + "grad_norm": 5.289761368392477, + "kl": 1.3271484375, "learning_rate": 1.3976078524055203e-07, - "loss": 0.0003, - "reward": 1.0416666865348816, - "reward_std": 0.3247893303632736, - "rewards/correct_code_reward_func": 0.5416666716337204, - "rewards/len_reward_func": 0.5, + "loss": 0.0013, + "reward": 0.9074198007583618, + "reward_std": 0.31024400889873505, + "rewards/correct_code_reward_func": 0.4166666716337204, + "rewards/len_reward_func": 0.4907531142234802, "step": 328 }, - { - "completion_length": 79.77083587646484, - "epoch": 5.272, - "grad_norm": 1.1872057595478824, - "kl": 0.21728515625, + { + "completion_length": 42.91666793823242, + "epoch": 5.224, + "grad_norm": 4.462596101311776, + "kl": 1.146484375, "learning_rate": 1.383096708384494e-07, - "loss": 0.0002, - "reward": 0.788268655538559, - "reward_std": 0.2649368643760681, - "rewards/correct_code_reward_func": 0.291666679084301, - "rewards/len_reward_func": 0.49660199880599976, + "loss": 0.0011, + "reward": 0.6458333432674408, + "reward_std": 0.1753452718257904, + "rewards/correct_code_reward_func": 0.1458333432674408, + "rewards/len_reward_func": 0.5, "step": 329 }, { - "completion_length": 46.04166793823242, - "epoch": 5.288, - "grad_norm": 1.0262222411098834, - "kl": 0.4150390625, + "completion_length": 35.52083492279053, + "epoch": 5.24, + "grad_norm": 1.0563895878531222, + "kl": 0.5283203125, "learning_rate": 1.3686324273886528e-07, - "loss": 0.0004, - "reward": 0.9375000298023224, - "reward_std": 0.3584126681089401, - "rewards/correct_code_reward_func": 0.4375000149011612, + "loss": 0.0005, + "reward": 0.9791666865348816, + "reward_std": 0.175345279276371, + "rewards/correct_code_reward_func": 0.4791666716337204, "rewards/len_reward_func": 0.5, "step": 330 }, { - "completion_length": 66.25000381469727, - "epoch": 5.304, - "grad_norm": 0.9039193450207341, - "kl": 2.11328125, + "completion_length": 37.64583396911621, + "epoch": 5.256, + "grad_norm": 5.06525759682497, + "kl": 0.2841796875, "learning_rate": 1.354215616310258e-07, - "loss": 0.0021, - "reward": 0.5625000298023224, - "reward_std": 0.13607724383473396, - "rewards/correct_code_reward_func": 0.06250000186264515, + "loss": 0.0003, + "reward": 1.1250000596046448, + "reward_std": 0.19500280916690826, + "rewards/correct_code_reward_func": 0.6250000298023224, "rewards/len_reward_func": 0.5, "step": 331 }, { - "completion_length": 54.354169845581055, - "epoch": 5.32, - "grad_norm": 0.5415314280458939, - "kl": 0.2373046875, + "completion_length": 38.58333396911621, + "epoch": 5.272, + "grad_norm": 9.911669806473292, + "kl": 0.388671875, "learning_rate": 1.339846880049829e-07, - "loss": 0.0002, - "reward": 0.942660003900528, - "reward_std": 0.282895028591156, - "rewards/correct_code_reward_func": 0.4583333358168602, - "rewards/len_reward_func": 0.48432666063308716, + "loss": 0.0004, + "reward": 0.9583333730697632, + "reward_std": 0.2357022613286972, + "rewards/correct_code_reward_func": 0.458333358168602, + "rewards/len_reward_func": 0.5, "step": 332 }, { - "completion_length": 51.625, - "epoch": 5.336, - "grad_norm": 0.8036004258808812, - "kl": 0.24755859375, + "completion_length": 28.437501907348633, + "epoch": 5.288, + "grad_norm": 3.57323242715857, + "kl": 1.20703125, "learning_rate": 1.325526821490761e-07, - "loss": 0.0002, - "reward": 0.6974638104438782, - "reward_std": 0.10789545625448227, - "rewards/correct_code_reward_func": 0.2083333395421505, - "rewards/len_reward_func": 0.489130437374115, + "loss": 0.0012, + "reward": 0.8958333432674408, + "reward_std": 0.2041093371808529, + "rewards/correct_code_reward_func": 0.3958333432674408, + "rewards/len_reward_func": 0.5, "step": 333 }, { - "completion_length": 53.97916793823242, - "epoch": 5.352, - "grad_norm": 1.2631560196521783, - "kl": 0.3173828125, + "completion_length": 49.60416793823242, + "epoch": 5.304, + "grad_norm": 6.351617997400021, + "kl": 0.169921875, "learning_rate": 1.3112560414740313e-07, - "loss": 0.0003, - "reward": 0.8738109171390533, - "reward_std": 0.28745321929454803, - "rewards/correct_code_reward_func": 0.3750000149011612, - "rewards/len_reward_func": 0.49881088733673096, + "loss": 0.0002, + "reward": 0.6666666865348816, + "reward_std": 0.24339044094085693, + "rewards/correct_code_reward_func": 0.1666666716337204, + "rewards/len_reward_func": 0.5, "step": 334 }, { - "completion_length": 62.270835876464844, - "epoch": 5.368, - "grad_norm": 0.7683313980515705, - "kl": 0.3125, + "completion_length": 27.89583396911621, + "epoch": 5.32, + "grad_norm": 1.227821200402909, + "kl": 0.3876953125, "learning_rate": 1.2970351387729872e-07, - "loss": 0.0003, - "reward": 1.0733543634414673, - "reward_std": 0.3076964318752289, - "rewards/correct_code_reward_func": 0.583333358168602, - "rewards/len_reward_func": 0.4900210201740265, + "loss": 0.0004, + "reward": 1.1250000596046448, + "reward_std": 0.07715167850255966, + "rewards/correct_code_reward_func": 0.6250000149011612, + "rewards/len_reward_func": 0.5, "step": 335 }, { - "completion_length": 65.70833587646484, - "epoch": 5.384, - "grad_norm": 0.6607582037952944, - "kl": 0.83056640625, + "completion_length": 35.479166984558105, + "epoch": 5.336, + "grad_norm": 6.4331146800412515, + "kl": 0.58984375, "learning_rate": 1.2828647100682261e-07, - "loss": 0.0008, - "reward": 0.8539415895938873, - "reward_std": 0.2910173237323761, - "rewards/correct_code_reward_func": 0.375, - "rewards/len_reward_func": 0.47894157469272614, + "loss": 0.0006, + "reward": 0.8541666865348816, + "reward_std": 0.13607725501060486, + "rewards/correct_code_reward_func": 0.3541666716337204, + "rewards/len_reward_func": 0.5, "step": 336 }, { - "completion_length": 50.333335876464844, - "epoch": 5.4, - "grad_norm": 1.4202119527945205, - "kl": 0.29052734375, + "completion_length": 36.41666793823242, + "epoch": 5.352, + "grad_norm": 403.82773226092394, + "kl": 451.5, "learning_rate": 1.2687453499225546e-07, - "loss": 0.0003, - "reward": 0.6875000298023224, - "reward_std": 0.21322892233729362, - "rewards/correct_code_reward_func": 0.18750000558793545, - "rewards/len_reward_func": 0.5, + "loss": 0.4527, + "reward": 0.9583333432674408, + "reward_std": 0.19500282034277916, + "rewards/correct_code_reward_func": 0.4791666865348816, + "rewards/len_reward_func": 0.4791666716337204, "step": 337 }, { - "completion_length": 70.83333587646484, - "epoch": 5.416, - "grad_norm": 0.671227545542802, - "kl": 0.2939453125, + "completion_length": 41.0, + "epoch": 5.368, + "grad_norm": 108.94594158857709, + "kl": 0.30615234375, "learning_rate": 1.2546776507560467e-07, "loss": 0.0003, - "reward": 1.1016666889190674, - "reward_std": 0.2055267058312893, - "rewards/correct_code_reward_func": 0.6041666865348816, - "rewards/len_reward_func": 0.4975000023841858, + "reward": 1.0833333730697632, + "reward_std": 0.376638799905777, + "rewards/correct_code_reward_func": 0.5833333432674408, + "rewards/len_reward_func": 0.5, "step": 338 }, { - "completion_length": 69.5, - "epoch": 5.432, - "grad_norm": 0.6490408285446165, - "kl": 0.298828125, + "completion_length": 43.87500190734863, + "epoch": 5.384, + "grad_norm": 5.008994119057386, + "kl": 0.23828125, "learning_rate": 1.2406622028211843e-07, - "loss": 0.0003, - "reward": 0.6640508472919464, - "reward_std": 0.3461538702249527, - "rewards/correct_code_reward_func": 0.1875000074505806, - "rewards/len_reward_func": 0.476550817489624, + "loss": 0.0002, + "reward": 0.8333333432674408, + "reward_std": 0.1178511306643486, + "rewards/correct_code_reward_func": 0.3541666716337204, + "rewards/len_reward_func": 0.4791666716337204, "step": 339 }, { - "completion_length": 68.1875, - "epoch": 5.448, - "grad_norm": 0.6222490159827438, - "kl": 0.412109375, + "completion_length": 27.58333396911621, + "epoch": 5.4, + "grad_norm": 5.089868304364812, + "kl": 0.3525390625, "learning_rate": 1.2266995941780933e-07, "loss": 0.0004, - "reward": 0.8004283905029297, - "reward_std": 0.2547912299633026, - "rewards/correct_code_reward_func": 0.3125000074505806, - "rewards/len_reward_func": 0.4879283607006073, + "reward": 0.8333333432674408, + "reward_std": 0.2342708557844162, + "rewards/correct_code_reward_func": 0.3333333358168602, + "rewards/len_reward_func": 0.5, "step": 340 }, { - "completion_length": 56.27083396911621, - "epoch": 5.464, - "grad_norm": 1.0219104011786817, - "kl": 0.4951171875, + "completion_length": 33.333335876464844, + "epoch": 5.416, + "grad_norm": 5.28992518624715, + "kl": 0.41015625, "learning_rate": 1.2127904106698665e-07, - "loss": 0.0005, - "reward": 0.7083333432674408, - "reward_std": 0.19500282034277916, - "rewards/correct_code_reward_func": 0.2291666679084301, - "rewards/len_reward_func": 0.4791666716337204, + "loss": 0.0004, + "reward": 1.1875000596046448, + "reward_std": 0.3310800790786743, + "rewards/correct_code_reward_func": 0.6875000298023224, + "rewards/len_reward_func": 0.5, "step": 341 }, { - "completion_length": 38.12500190734863, - "epoch": 5.48, - "grad_norm": 0.9860315140229468, - "kl": 0.3369140625, + "completion_length": 53.68750286102295, + "epoch": 5.432, + "grad_norm": 6.798840307980019, + "kl": 0.478515625, "learning_rate": 1.1989352358979888e-07, - "loss": 0.0003, - "reward": 0.9791666865348816, - "reward_std": 0.0589255653321743, - "rewards/correct_code_reward_func": 0.4791666716337204, - "rewards/len_reward_func": 0.5, + "loss": 0.0005, + "reward": 0.8114994466304779, + "reward_std": 0.39734339714050293, + "rewards/correct_code_reward_func": 0.3333333432674408, + "rewards/len_reward_func": 0.4781661033630371, "step": 342 }, { - "completion_length": 42.41666793823242, - "epoch": 5.496, - "grad_norm": 0.604639948655544, - "kl": 0.314453125, + "completion_length": 46.29166793823242, + "epoch": 5.448, + "grad_norm": 2.7139372433900153, + "kl": 0.326171875, "learning_rate": 1.1851346511978424e-07, "loss": 0.0003, - "reward": 1.0208333730697632, - "reward_std": 0.25392835959792137, - "rewards/correct_code_reward_func": 0.5416666865348816, - "rewards/len_reward_func": 0.4791666716337204, + "reward": 0.8063492178916931, + "reward_std": 0.15347431600093842, + "rewards/correct_code_reward_func": 0.3125000149011612, + "rewards/len_reward_func": 0.4938492029905319, "step": 343 }, { - "completion_length": 123.3125, - "epoch": 5.5120000000000005, - "grad_norm": 0.5749882327850088, - "kl": 1.2080078125, + "completion_length": 32.937500953674316, + "epoch": 5.464, + "grad_norm": 12.07849541059564, + "kl": 0.2080078125, "learning_rate": 1.1713892356143238e-07, - "loss": 0.0012, - "reward": 0.6666666865348816, - "reward_std": 0.2342708334326744, - "rewards/correct_code_reward_func": 0.1666666716337204, + "loss": 0.0002, + "reward": 0.8125000298023224, + "reward_std": 0.22516433894634247, + "rewards/correct_code_reward_func": 0.3125000074505806, "rewards/len_reward_func": 0.5, "step": 344 }, { - "completion_length": 70.64583587646484, - "epoch": 5.5280000000000005, - "grad_norm": 0.6097710691987239, - "kl": 0.251953125, + "completion_length": 25.104166984558105, + "epoch": 5.48, + "grad_norm": 19.666094571207797, + "kl": 0.6865234375, "learning_rate": 1.1576995658775404e-07, - "loss": 0.0003, - "reward": 0.9583333432674408, - "reward_std": 0.36751921474933624, - "rewards/correct_code_reward_func": 0.4583333432674408, + "loss": 0.0007, + "reward": 0.916666716337204, + "reward_std": 0.19500282034277916, + "rewards/correct_code_reward_func": 0.4166666865348816, "rewards/len_reward_func": 0.5, "step": 345 }, { - "completion_length": 42.50000190734863, - "epoch": 5.5440000000000005, - "grad_norm": 0.9560818449812926, - "kl": 0.578125, + "completion_length": 22.437500953674316, + "epoch": 5.496, + "grad_norm": 7.745046511847673, + "kl": 4.3095703125, "learning_rate": 1.1440662163786166e-07, - "loss": 0.0006, - "reward": 1.0208333730697632, - "reward_std": 0.30231600999832153, - "rewards/correct_code_reward_func": 0.5208333432674408, + "loss": 0.0043, + "reward": 1.2083333730697632, + "reward_std": 0.2903675250709057, + "rewards/correct_code_reward_func": 0.708333358168602, "rewards/len_reward_func": 0.5, "step": 346 }, { - "completion_length": 62.22916793823242, - "epoch": 5.5600000000000005, - "grad_norm": 0.5566570160342841, - "kl": 0.619873046875, + "completion_length": 68.66666793823242, + "epoch": 5.5120000000000005, + "grad_norm": 1.9069458533343282, + "kl": 0.35986328125, "learning_rate": 1.1304897591455928e-07, - "loss": 0.0006, - "reward": 0.7708333730697632, - "reward_std": 0.2903806045651436, - "rewards/correct_code_reward_func": 0.2708333395421505, - "rewards/len_reward_func": 0.5, + "loss": 0.0004, + "reward": 0.8611658215522766, + "reward_std": 0.3323476314544678, + "rewards/correct_code_reward_func": 0.375, + "rewards/len_reward_func": 0.4861658066511154, "step": 347 }, { - "completion_length": 59.54166793823242, - "epoch": 5.576, - "grad_norm": 10.238580129462242, - "kl": 1.056640625, + "completion_length": 56.56250190734863, + "epoch": 5.5280000000000005, + "grad_norm": 11.309145499622288, + "kl": 2.7958984375, "learning_rate": 1.1169707638194237e-07, - "loss": 0.0011, - "reward": 0.9679256975650787, - "reward_std": 0.3028605878353119, - "rewards/correct_code_reward_func": 0.4791666865348816, - "rewards/len_reward_func": 0.48875901103019714, + "loss": 0.0028, + "reward": 0.9375000596046448, + "reward_std": 0.24056155234575272, + "rewards/correct_code_reward_func": 0.4375000149011612, + "rewards/len_reward_func": 0.5, "step": 348 }, { - "completion_length": 72.79166793823242, - "epoch": 5.592, - "grad_norm": 1.3851306805475117, - "kl": 0.3427734375, + "completion_length": 19.687500953674316, + "epoch": 5.5440000000000005, + "grad_norm": 4.48965441839464, + "kl": 0.3076171875, "learning_rate": 1.103509797630077e-07, "loss": 0.0003, - "reward": 0.7083333432674408, - "reward_std": 0.1178511306643486, - "rewards/correct_code_reward_func": 0.20833333395421505, + "reward": 1.1041667461395264, + "reward_std": 0.08625819534063339, + "rewards/correct_code_reward_func": 0.6041666865348816, "rewards/len_reward_func": 0.5, "step": 349 }, { - "completion_length": 41.72916793823242, - "epoch": 5.608, - "grad_norm": 0.5416470450569314, - "kl": 0.4501953125, + "completion_length": 48.33333396911621, + "epoch": 5.5600000000000005, + "grad_norm": 0.03083945474392255, + "kl": 24.33740234375, "learning_rate": 1.0901074253727336e-07, - "loss": 0.0004, - "reward": 0.6041666865348816, - "reward_std": 0.2998815253376961, - "rewards/correct_code_reward_func": 0.1458333395421505, - "rewards/len_reward_func": 0.4583333432674408, + "loss": 0.0243, + "reward": 0.7916666865348816, + "reward_std": 0.24966806173324585, + "rewards/correct_code_reward_func": 0.2916666865348816, + "rewards/len_reward_func": 0.5, "step": 350 }, { - "completion_length": 45.54166793823242, - "epoch": 5.624, - "grad_norm": 1.8102596792224883, - "kl": 0.3388671875, + "completion_length": 32.27083492279053, + "epoch": 5.576, + "grad_norm": 19.443719364276035, + "kl": 0.41796875, "learning_rate": 1.0767642093840932e-07, - "loss": 0.0003, - "reward": 0.9583333432674408, - "reward_std": 0.22233543917536736, - "rewards/correct_code_reward_func": 0.4583333432674408, - "rewards/len_reward_func": 0.5, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.1178511306643486, + "rewards/correct_code_reward_func": 0.520833333954215, + "rewards/len_reward_func": 0.4791666716337204, "step": 351 }, { - "completion_length": 41.1875, - "epoch": 5.64, - "grad_norm": 0.9307078643543488, - "kl": 0.630859375, + "completion_length": 55.39583396911621, + "epoch": 5.592, + "grad_norm": 3.0676472737509886, + "kl": 0.328125, "learning_rate": 1.0634807095187737e-07, - "loss": 0.0006, - "reward": 1.0208333432674408, - "reward_std": 0.28126102685928345, - "rewards/correct_code_reward_func": 0.5208333432674408, + "loss": 0.0003, + "reward": 0.7708333432674408, + "reward_std": 0.25392837077379227, + "rewards/correct_code_reward_func": 0.2708333432674408, "rewards/len_reward_func": 0.5, "step": 352 }, { - "completion_length": 98.77083396911621, - "epoch": 5.656, - "grad_norm": 0.9009026898209829, - "kl": 0.23828125, + "completion_length": 26.125000953674316, + "epoch": 5.608, + "grad_norm": 2.15117359028803, + "kl": 0.3544921875, "learning_rate": 1.0502574831258257e-07, - "loss": 0.0002, - "reward": 0.6663535237312317, - "reward_std": 0.15518909692764282, - "rewards/correct_code_reward_func": 0.1666666716337204, - "rewards/len_reward_func": 0.4996868371963501, + "loss": 0.0004, + "reward": 0.8958333730697632, + "reward_std": 0.23144196718931198, + "rewards/correct_code_reward_func": 0.395833358168602, + "rewards/len_reward_func": 0.5, "step": 353 }, { - "completion_length": 46.60416793823242, - "epoch": 5.672, - "grad_norm": 0.5134917708977882, - "kl": 0.29931640625, + "completion_length": 26.729167938232422, + "epoch": 5.624, + "grad_norm": 2.1508382760067555, + "kl": 0.99462890625, "learning_rate": 1.0370950850253449e-07, - "loss": 0.0003, - "reward": 0.9583333730697632, - "reward_std": 0.36124157905578613, - "rewards/correct_code_reward_func": 0.4583333432674408, + "loss": 0.001, + "reward": 1.1666666865348816, + "reward_std": 0.1178511306643486, + "rewards/correct_code_reward_func": 0.6666666865348816, "rewards/len_reward_func": 0.5, "step": 354 }, { - "completion_length": 64.14583587646484, - "epoch": 5.688, - "grad_norm": 1.0342769156720488, - "kl": 0.474609375, + "completion_length": 25.166666984558105, + "epoch": 5.64, + "grad_norm": 1.609591950875138, + "kl": 0.26611328125, "learning_rate": 1.0239940674851941e-07, - "loss": 0.0005, - "reward": 0.734243243932724, - "reward_std": 0.3072577565908432, - "rewards/correct_code_reward_func": 0.2500000111758709, - "rewards/len_reward_func": 0.484243243932724, + "loss": 0.0003, + "reward": 1.041666716337204, + "reward_std": 0.24966806918382645, + "rewards/correct_code_reward_func": 0.5416666865348816, + "rewards/len_reward_func": 0.5, "step": 355 }, { - "completion_length": 48.770835876464844, - "epoch": 5.704, - "grad_norm": 0.9963455906140943, - "kl": 0.43359375, + "completion_length": 72.22916793823242, + "epoch": 5.656, + "grad_norm": 2.9309105849693218, + "kl": 0.1826171875, "learning_rate": 1.0109549801978304e-07, - "loss": 0.0004, - "reward": 1.1666666865348816, - "reward_std": 0.2721545100212097, - "rewards/correct_code_reward_func": 0.6666666716337204, + "loss": 0.0002, + "reward": 0.625, + "reward_std": 0.16623875498771667, + "rewards/correct_code_reward_func": 0.125, "rewards/len_reward_func": 0.5, "step": 356 }, { - "completion_length": 36.04166793823242, - "epoch": 5.72, - "grad_norm": 4.6092221952278845, - "kl": 0.7607421875, - "learning_rate": 9.979783702572411e-08, - "loss": 0.0008, - "reward": 1.0833333730697632, - "reward_std": 0.379454605281353, - "rewards/correct_code_reward_func": 0.5833333730697632, + "completion_length": 45.062500953674316, + "epoch": 5.672, + "grad_norm": 4.139961399350185, + "kl": 0.486328125, + "learning_rate": 9.979783702572411e-08, + "loss": 0.0005, + "reward": 1.1041666865348816, + "reward_std": 0.5021650195121765, + "rewards/correct_code_reward_func": 0.6041666865348816, "rewards/len_reward_func": 0.5, "step": 357 }, { - "completion_length": 48.91666793823242, - "epoch": 5.736, - "grad_norm": 0.582554266309863, - "kl": 0.3466796875, + "completion_length": 51.70833396911621, + "epoch": 5.688, + "grad_norm": 3.6459872360115355, + "kl": 0.19677734375, "learning_rate": 9.850647821359917e-08, - "loss": 0.0003, - "reward": 0.6666666865348816, - "reward_std": 0.1178511306643486, - "rewards/correct_code_reward_func": 0.16666667722165585, - "rewards/len_reward_func": 0.5, + "loss": 0.0002, + "reward": 0.875, + "reward_std": 0.31142252683639526, + "rewards/correct_code_reward_func": 0.3958333358168602, + "rewards/len_reward_func": 0.4791666716337204, "step": 358 }, { - "completion_length": 69.31250190734863, - "epoch": 5.752, - "grad_norm": 0.34328751966585425, - "kl": 0.3115234375, + "completion_length": 23.02083396911621, + "epoch": 5.704, + "grad_norm": 8.882333735346968, + "kl": 0.27685546875, "learning_rate": 9.722147576623744e-08, "loss": 0.0003, - "reward": 0.8272929191589355, - "reward_std": 0.09869782626628876, - "rewards/correct_code_reward_func": 0.3541666716337204, - "rewards/len_reward_func": 0.47312623262405396, + "reward": 1.2291666865348816, + "reward_std": 0.1480126492679119, + "rewards/correct_code_reward_func": 0.7291666865348816, + "rewards/len_reward_func": 0.5, "step": 359 }, { - "completion_length": 49.04166793823242, - "epoch": 5.768, - "grad_norm": 1.1485420986178878, - "kl": 0.482421875, + "completion_length": 22.500000953674316, + "epoch": 5.72, + "grad_norm": 1.769657123945252, + "kl": 0.3662109375, "learning_rate": 9.594288359976815e-08, - "loss": 0.0005, - "reward": 0.7708333432674408, - "reward_std": 0.3190068006515503, - "rewards/correct_code_reward_func": 0.2916666716337204, + "loss": 0.0004, + "reward": 1.1875, + "reward_std": 0.23144196718931198, + "rewards/correct_code_reward_func": 0.7083333432674408, "rewards/len_reward_func": 0.4791666716337204, "step": 360 }, { - "completion_length": 48.25000190734863, - "epoch": 5.784, - "grad_norm": 0.8072704072425921, - "kl": 0.6787109375, + "completion_length": 31.979167938232422, + "epoch": 5.736, + "grad_norm": 1.341039622679155, + "kl": 0.4580078125, "learning_rate": 9.467075536135785e-08, - "loss": 0.0007, - "reward": 0.6250000298023224, - "reward_std": 0.16623876243829727, - "rewards/correct_code_reward_func": 0.1250000037252903, + "loss": 0.0005, + "reward": 0.6041666865348816, + "reward_std": 0.14801263809204102, + "rewards/correct_code_reward_func": 0.1041666716337204, "rewards/len_reward_func": 0.5, "step": 361 }, { - "completion_length": 41.93750190734863, - "epoch": 5.8, - "grad_norm": 0.8387212625747497, - "kl": 0.23095703125, + "completion_length": 61.875, + "epoch": 5.752, + "grad_norm": 6.078841666686185, + "kl": 2.7958984375, "learning_rate": 9.340514442695952e-08, - "loss": 0.0002, - "reward": 0.9375000596046448, - "reward_std": 0.2041093111038208, - "rewards/correct_code_reward_func": 0.4375000149011612, - "rewards/len_reward_func": 0.5, + "loss": 0.0028, + "reward": 0.7839381992816925, + "reward_std": 0.23519539088010788, + "rewards/correct_code_reward_func": 0.3125000074505806, + "rewards/len_reward_func": 0.4714381694793701, "step": 362 }, { - "completion_length": 53.395835876464844, - "epoch": 5.816, - "grad_norm": 0.5998290170212329, - "kl": 1.482421875, + "completion_length": 29.25, + "epoch": 5.768, + "grad_norm": 0.5338505194219121, + "kl": 0.32421875, "learning_rate": 9.214610389907326e-08, - "loss": 0.0015, - "reward": 0.7685093283653259, - "reward_std": 0.49610376358032227, - "rewards/correct_code_reward_func": 0.3125, - "rewards/len_reward_func": 0.45600931346416473, + "loss": 0.0003, + "reward": 0.7083333730697632, + "reward_std": 0.07715167850255966, + "rewards/correct_code_reward_func": 0.2083333395421505, + "rewards/len_reward_func": 0.5, "step": 363 }, { - "completion_length": 58.54166793823242, - "epoch": 5.832, - "grad_norm": 0.7846225063037306, - "kl": 1.11376953125, + "completion_length": 34.52083396911621, + "epoch": 5.784, + "grad_norm": 1.9529867884255245, + "kl": 0.2939453125, "learning_rate": 9.089368660451798e-08, - "loss": 0.0011, - "reward": 0.6875, - "reward_std": 0.13607725501060486, - "rewards/correct_code_reward_func": 0.1875, + "loss": 0.0003, + "reward": 0.8541666865348816, + "reward_std": 0.23144195601344109, + "rewards/correct_code_reward_func": 0.3541666716337204, "rewards/len_reward_func": 0.5, "step": 364 }, { - "completion_length": 44.33333396911621, - "epoch": 5.848, - "grad_norm": 2.4593358833922423, - "kl": 0.4462890625, + "completion_length": 36.06250190734863, + "epoch": 5.8, + "grad_norm": 4.331896756750715, + "kl": 0.27197265625, "learning_rate": 8.964794509221507e-08, - "loss": 0.0004, - "reward": 0.8750000298023224, - "reward_std": 0.3521219715476036, - "rewards/correct_code_reward_func": 0.3958333432674408, - "rewards/len_reward_func": 0.4791666716337204, + "loss": 0.0003, + "reward": 1.1458333730697632, + "reward_std": 0.175345279276371, + "rewards/correct_code_reward_func": 0.6458333432674408, + "rewards/len_reward_func": 0.5, "step": 365 }, { - "completion_length": 80.77083396911621, - "epoch": 5.864, - "grad_norm": 0.6173937823092938, - "kl": 0.2802734375, + "completion_length": 30.479166984558105, + "epoch": 5.816, + "grad_norm": 1.170767590853997, + "kl": 5.4716796875, "learning_rate": 8.840893163098332e-08, - "loss": 0.0003, - "reward": 0.7708333432674408, - "reward_std": 0.1480126492679119, - "rewards/correct_code_reward_func": 0.27083333395421505, + "loss": 0.0055, + "reward": 0.8333333432674408, + "reward_std": 0.2342708557844162, + "rewards/correct_code_reward_func": 0.3333333358168602, "rewards/len_reward_func": 0.5, "step": 366 }, { - "completion_length": 35.89583396911621, - "epoch": 5.88, - "grad_norm": 1.0790029740906348, - "kl": 0.20703125, + "completion_length": 36.854166984558105, + "epoch": 5.832, + "grad_norm": 4.742230256398718, + "kl": 0.53125, "learning_rate": 8.717669820734619e-08, - "loss": 0.0002, - "reward": 1.2291667461395264, - "reward_std": 0.14801263809204102, - "rewards/correct_code_reward_func": 0.7291666865348816, + "loss": 0.0005, + "reward": 0.8750000298023224, + "reward_std": 0.22233545035123825, + "rewards/correct_code_reward_func": 0.3750000149011612, "rewards/len_reward_func": 0.5, "step": 367 }, { - "completion_length": 59.020835876464844, - "epoch": 5.896, - "grad_norm": 0.5169480521892297, - "kl": 1.3037109375, + "completion_length": 29.041667938232422, + "epoch": 5.848, + "grad_norm": 3.0949748155814967, + "kl": 0.8212890625, "learning_rate": 8.595129652335017e-08, - "loss": 0.0013, - "reward": 0.8125000298023224, - "reward_std": 0.13607724383473396, - "rewards/correct_code_reward_func": 0.31250002048909664, - "rewards/len_reward_func": 0.5, + "loss": 0.0008, + "reward": 0.8174912929534912, + "reward_std": 0.30320997536182404, + "rewards/correct_code_reward_func": 0.3333333432674408, + "rewards/len_reward_func": 0.484157919883728, "step": 368 }, { - "completion_length": 40.52083396911621, - "epoch": 5.912, - "grad_norm": 1.4524047141423366, - "kl": 2.77392578125, + "completion_length": 42.04166793823242, + "epoch": 5.864, + "grad_norm": 2.5673081335228805, + "kl": 0.25244140625, "learning_rate": 8.473277799439568e-08, - "loss": 0.0028, - "reward": 0.8333333432674408, - "reward_std": 0.36751921474933624, - "rewards/correct_code_reward_func": 0.3333333358168602, + "loss": 0.0003, + "reward": 0.7916666865348816, + "reward_std": 0.1451837606728077, + "rewards/correct_code_reward_func": 0.2916666716337204, "rewards/len_reward_func": 0.5, "step": 369 }, { - "completion_length": 57.31250190734863, - "epoch": 5.928, - "grad_norm": 0.623572783170388, - "kl": 0.3564453125, + "completion_length": 21.416666984558105, + "epoch": 5.88, + "grad_norm": 8.714583282604275, + "kl": 0.5546875, "learning_rate": 8.352119374707977e-08, - "loss": 0.0004, - "reward": 0.7083333432674408, - "reward_std": 0.19500282034277916, - "rewards/correct_code_reward_func": 0.2083333432674408, + "loss": 0.0006, + "reward": 1.2916666865348816, + "reward_std": 0.1178511306643486, + "rewards/correct_code_reward_func": 0.7916666865348816, "rewards/len_reward_func": 0.5, "step": 370 }, { - "completion_length": 65.50000381469727, - "epoch": 5.944, - "grad_norm": 0.9283491589642439, - "kl": 1.005859375, + "completion_length": 45.1875, + "epoch": 5.896, + "grad_norm": 3.7959594929992884, + "kl": 0.26953125, "learning_rate": 8.23165946170509e-08, - "loss": 0.001, - "reward": 0.5236008167266846, - "reward_std": 0.20291338860988617, - "rewards/correct_code_reward_func": 0.0625, - "rewards/len_reward_func": 0.46110081672668457, + "loss": 0.0003, + "reward": 0.875, + "reward_std": 0.1451837606728077, + "rewards/correct_code_reward_func": 0.375, + "rewards/len_reward_func": 0.5, "step": 371 }, { - "completion_length": 46.458335876464844, - "epoch": 5.96, - "grad_norm": 0.53710215454902, - "kl": 2.33349609375, + "completion_length": 33.22916793823242, + "epoch": 5.912, + "grad_norm": 2.6612237762750697, + "kl": 0.349609375, "learning_rate": 8.11190311468759e-08, - "loss": 0.0023, - "reward": 0.791666716337204, - "reward_std": 0.19500282034277916, - "rewards/correct_code_reward_func": 0.29166667722165585, + "loss": 0.0004, + "reward": 1.1666666865348816, + "reward_std": 0.2342708334326744, + "rewards/correct_code_reward_func": 0.6666666716337204, "rewards/len_reward_func": 0.5, "step": 372 }, { - "completion_length": 40.33333396911621, - "epoch": 5.976, - "grad_norm": 1.2192231385916954, - "kl": 0.4990234375, + "completion_length": 36.47916793823242, + "epoch": 5.928, + "grad_norm": 1.6123592594099534, + "kl": 0.23291015625, "learning_rate": 7.992855358391967e-08, - "loss": 0.0005, - "reward": 0.8691092431545258, - "reward_std": 0.30702924728393555, + "loss": 0.0002, + "reward": 0.8750000298023224, + "reward_std": 0.20693820342421532, "rewards/correct_code_reward_func": 0.3750000149011612, - "rewards/len_reward_func": 0.4941091984510422, + "rewards/len_reward_func": 0.5, "step": 373 }, { - "completion_length": 55.29166793823242, - "epoch": 5.992, - "grad_norm": 0.8470940290984582, - "kl": 0.1962890625, + "completion_length": 44.54166793823242, + "epoch": 5.944, + "grad_norm": 7.71060276672806, + "kl": 0.27490234375, "learning_rate": 7.87452118782363e-08, - "loss": 0.0002, - "reward": 0.7673184722661972, - "reward_std": 0.14601882547140121, - "rewards/correct_code_reward_func": 0.2916666865348816, - "rewards/len_reward_func": 0.4756517857313156, + "loss": 0.0003, + "reward": 0.791666716337204, + "reward_std": 0.1451837606728077, + "rewards/correct_code_reward_func": 0.29166667722165585, + "rewards/len_reward_func": 0.5, "step": 374 }, { - "completion_length": 37.083335876464844, - "epoch": 6.0, - "grad_norm": 0.8265558358146432, - "kl": 0.361328125, + "completion_length": 29.4375, + "epoch": 5.96, + "grad_norm": 2.403326466418192, + "kl": 0.6865234375, "learning_rate": 7.756905568047392e-08, - "loss": 0.0002, - "reward": 1.125, - "reward_std": 0.1178511306643486, - "rewards/correct_code_reward_func": 0.625, + "loss": 0.0007, + "reward": 0.854166716337204, + "reward_std": 0.25392838940024376, + "rewards/correct_code_reward_func": 0.35416667722165585, "rewards/len_reward_func": 0.5, "step": 375 }, { - "completion_length": 72.37500381469727, - "epoch": 6.016, - "grad_norm": 0.9354150737741886, - "kl": 1.421875, + "completion_length": 22.916667938232422, + "epoch": 5.976, + "grad_norm": 1.6436335508907451, + "kl": 1.37890625, "learning_rate": 7.640013433979093e-08, "loss": 0.0014, - "reward": 0.6424994468688965, - "reward_std": 0.2443188726902008, - "rewards/correct_code_reward_func": 0.2708333358168602, - "rewards/len_reward_func": 0.3716660961508751, + "reward": 0.9583333730697632, + "reward_std": 0.1178511306643486, + "rewards/correct_code_reward_func": 0.458333358168602, + "rewards/len_reward_func": 0.5, "step": 376 }, { - "completion_length": 47.04166793823242, - "epoch": 6.032, - "grad_norm": 0.9061876886193456, - "kl": 2.72265625, + "completion_length": 34.08333396911621, + "epoch": 5.992, + "grad_norm": 5.19625049583634, + "kl": 0.3408203125, "learning_rate": 7.523849690178566e-08, - "loss": 0.0027, - "reward": 1.0833333730697632, - "reward_std": 0.19500282034277916, - "rewards/correct_code_reward_func": 0.5833333432674408, + "loss": 0.0003, + "reward": 0.8958333730697632, + "reward_std": 0.08625819534063339, + "rewards/correct_code_reward_func": 0.3958333432674408, "rewards/len_reward_func": 0.5, "step": 377 }, { - "completion_length": 60.85416793823242, - "epoch": 6.048, - "grad_norm": 0.6283396491998571, - "kl": 0.57275390625, + "completion_length": 18.916667938232422, + "epoch": 6.0, + "grad_norm": 0.03498244958553028, + "kl": 0.35546875, "learning_rate": 7.408419210643846e-08, - "loss": 0.0006, - "reward": 0.6338140964508057, - "reward_std": 0.316009059548378, - "rewards/correct_code_reward_func": 0.1666666679084301, - "rewards/len_reward_func": 0.46714743971824646, + "loss": 0.0002, + "reward": 1.1666667461395264, + "reward_std": 0.0, + "rewards/correct_code_reward_func": 0.6666666865348816, + "rewards/len_reward_func": 0.5, "step": 378 }, { - "completion_length": 67.72916793823242, - "epoch": 6.064, - "grad_norm": 0.8372798612259558, - "kl": 0.125244140625, + "completion_length": 37.250000953674316, + "epoch": 6.016, + "grad_norm": 5.584972835115381, + "kl": 2.3876953125, "learning_rate": 7.293726838606673e-08, - "loss": 0.0001, - "reward": 0.7893698513507843, - "reward_std": 0.3696397468447685, - "rewards/correct_code_reward_func": 0.3125000111758709, - "rewards/len_reward_func": 0.4768698215484619, + "loss": 0.0024, + "reward": 0.8750000596046448, + "reward_std": 0.2994871214032173, + "rewards/correct_code_reward_func": 0.3750000149011612, + "rewards/len_reward_func": 0.5, "step": 379 }, { - "completion_length": 46.72916793823242, - "epoch": 6.08, - "grad_norm": 0.9275523238857312, - "kl": 0.4658203125, + "completion_length": 32.1875, + "epoch": 6.032, + "grad_norm": 2.6356609513901277, + "kl": 0.33203125, "learning_rate": 7.179777386329275e-08, - "loss": 0.0005, - "reward": 0.8125000298023224, - "reward_std": 0.3430154323577881, - "rewards/correct_code_reward_func": 0.3333333432674408, - "rewards/len_reward_func": 0.4791666716337204, + "loss": 0.0003, + "reward": 1.1250000596046448, + "reward_std": 0.07715167850255966, + "rewards/correct_code_reward_func": 0.6250000298023224, + "rewards/len_reward_func": 0.5, "step": 380 }, { - "completion_length": 54.87500190734863, - "epoch": 6.096, - "grad_norm": 6.268509007657519, - "kl": 4.9619140625, + "completion_length": 32.60416793823242, + "epoch": 6.048, + "grad_norm": 1.625137414185801, + "kl": 0.345703125, "learning_rate": 7.066575634902435e-08, - "loss": 0.0049, - "reward": 0.8333333730697632, - "reward_std": 0.15430335700511932, - "rewards/correct_code_reward_func": 0.3333333432674408, + "loss": 0.0003, + "reward": 0.6666666865348816, + "reward_std": 0.17251639068126678, + "rewards/correct_code_reward_func": 0.1666666716337204, "rewards/len_reward_func": 0.5, "step": 381 }, { - "completion_length": 58.04166793823242, - "epoch": 6.112, - "grad_norm": 0.7207271037372853, - "kl": 0.197265625, + "completion_length": 45.770835876464844, + "epoch": 6.064, + "grad_norm": 1.869234906894, + "kl": 0.248046875, "learning_rate": 6.954126334044949e-08, "loss": 0.0002, - "reward": 0.6414414346218109, - "reward_std": 0.14849938824772835, - "rewards/correct_code_reward_func": 0.14583333395421505, - "rewards/len_reward_func": 0.4956081211566925, + "reward": 0.9583333432674408, + "reward_std": 0.320542111992836, + "rewards/correct_code_reward_func": 0.4583333432674408, + "rewards/len_reward_func": 0.5, "step": 382 }, { - "completion_length": 70.14583587646484, - "epoch": 6.128, - "grad_norm": 0.8405760325659827, - "kl": 0.47216796875, + "completion_length": 32.89583396911621, + "epoch": 6.08, + "grad_norm": 4.898274958092342, + "kl": 0.26123046875, "learning_rate": 6.842434201904255e-08, - "loss": 0.0005, - "reward": 0.7068032324314117, - "reward_std": 0.08147954940795898, - "rewards/correct_code_reward_func": 0.2291666679084301, - "rewards/len_reward_func": 0.47763654589653015, + "loss": 0.0003, + "reward": 0.9791666865348816, + "reward_std": 0.4130779355764389, + "rewards/correct_code_reward_func": 0.4791666716337204, + "rewards/len_reward_func": 0.5, "step": 383 }, { - "completion_length": 60.66666793823242, - "epoch": 6.144, - "grad_norm": 0.7554968229024656, - "kl": 0.412109375, + "completion_length": 39.60416793823242, + "epoch": 6.096, + "grad_norm": 2.7651784941334285, + "kl": 0.3056640625, "learning_rate": 6.731503924858516e-08, - "loss": 0.0004, - "reward": 0.8333333730697632, - "reward_std": 0.15430335700511932, - "rewards/correct_code_reward_func": 0.3333333432674408, + "loss": 0.0003, + "reward": 1.0833333730697632, + "reward_std": 0.20693820342421532, + "rewards/correct_code_reward_func": 0.583333358168602, "rewards/len_reward_func": 0.5, "step": 384 }, { - "completion_length": 38.8125, - "epoch": 6.16, - "grad_norm": 1.1056222668520357, - "kl": 1.001953125, + "completion_length": 44.93750190734863, + "epoch": 6.112, + "grad_norm": 4.863219550347464, + "kl": 6.7158203125, "learning_rate": 6.621340157319996e-08, - "loss": 0.001, - "reward": 0.8958333730697632, - "reward_std": 0.14801263809204102, - "rewards/correct_code_reward_func": 0.3958333432674408, + "loss": 0.0067, + "reward": 0.6875000298023224, + "reward_std": 0.0589255653321743, + "rewards/correct_code_reward_func": 0.18750000558793545, "rewards/len_reward_func": 0.5, "step": 385 }, { - "completion_length": 61.708335876464844, - "epoch": 6.176, - "grad_norm": 0.3838161763098325, - "kl": 0.23974609375, + "completion_length": 42.58333396911621, + "epoch": 6.128, + "grad_norm": 2.178001546252181, + "kl": 0.287109375, "learning_rate": 6.511947521539737e-08, - "loss": 0.0002, - "reward": 0.5833333432674408, - "reward_std": 0.08908708393573761, - "rewards/correct_code_reward_func": 0.0833333358168602, + "loss": 0.0003, + "reward": 0.8125000298023224, + "reward_std": 0.3142513930797577, + "rewards/correct_code_reward_func": 0.3125000149011612, "rewards/len_reward_func": 0.5, "step": 386 }, { - "completion_length": 57.520835876464844, - "epoch": 6.192, - "grad_norm": 0.6603039357563164, - "kl": 0.1806640625, + "completion_length": 22.979166984558105, + "epoch": 6.144, + "grad_norm": 1.6068071891620737, + "kl": 0.37109375, "learning_rate": 6.403330607413643e-08, - "loss": 0.0002, - "reward": 0.7342728972434998, - "reward_std": 0.26681847125291824, - "rewards/correct_code_reward_func": 0.2500000111758709, - "rewards/len_reward_func": 0.48427288234233856, + "loss": 0.0004, + "reward": 0.9583333432674408, + "reward_std": 0.28408990800380707, + "rewards/correct_code_reward_func": 0.4583333432674408, + "rewards/len_reward_func": 0.5, "step": 387 }, { - "completion_length": 40.062500953674316, - "epoch": 6.208, - "grad_norm": 2.799261156324464, - "kl": 0.681640625, + "completion_length": 23.604166984558105, + "epoch": 6.16, + "grad_norm": 3.9928344479996607, + "kl": 0.3447265625, "learning_rate": 6.295493972289903e-08, - "loss": 0.0007, - "reward": 1.125, - "reward_std": 0.2721545100212097, - "rewards/correct_code_reward_func": 0.625, + "loss": 0.0003, + "reward": 1.0000000596046448, + "reward_std": 0.1178511306643486, + "rewards/correct_code_reward_func": 0.5000000149011612, "rewards/len_reward_func": 0.5, "step": 388 }, { - "completion_length": 84.625, - "epoch": 6.224, - "grad_norm": 0.7649357234507214, - "kl": 0.185546875, + "completion_length": 33.89583396911621, + "epoch": 6.176, + "grad_norm": 2.632692948523292, + "kl": 0.3388671875, "learning_rate": 6.188442140777742e-08, - "loss": 0.0002, - "reward": 0.7266082167625427, - "reward_std": 0.2673238180577755, - "rewards/correct_code_reward_func": 0.2500000149011612, - "rewards/len_reward_func": 0.47660820186138153, + "loss": 0.0003, + "reward": 0.6666666865348816, + "reward_std": 0.15430335700511932, + "rewards/correct_code_reward_func": 0.1666666679084301, + "rewards/len_reward_func": 0.5, "step": 389 }, { - "completion_length": 101.83333969116211, - "epoch": 6.24, - "grad_norm": 0.8017473502069168, - "kl": 2.1689453125, + "completion_length": 28.166667938232422, + "epoch": 6.192, + "grad_norm": 8.350787780570107, + "kl": 5.517578125, "learning_rate": 6.082179604557616e-08, - "loss": 0.0022, - "reward": 0.6834983229637146, - "reward_std": 0.23408660292625427, - "rewards/correct_code_reward_func": 0.1875, - "rewards/len_reward_func": 0.4959982931613922, + "loss": 0.0055, + "reward": 1.0416666865348816, + "reward_std": 0.1451837718486786, + "rewards/correct_code_reward_func": 0.5416666716337204, + "rewards/len_reward_func": 0.5, "step": 390 }, { - "completion_length": 38.85416793823242, - "epoch": 6.256, - "grad_norm": 0.9679844099282252, - "kl": 0.341796875, + "completion_length": 19.979166984558105, + "epoch": 6.208, + "grad_norm": 21.31957241329922, + "kl": 0.837890625, "learning_rate": 5.976710822192721e-08, - "loss": 0.0003, - "reward": 1.1875000596046448, - "reward_std": 0.40680031478405, - "rewards/correct_code_reward_func": 0.6875000298023224, + "loss": 0.0008, + "reward": 1.1666666865348816, + "reward_std": 0.22233545035123825, + "rewards/correct_code_reward_func": 0.6666666716337204, "rewards/len_reward_func": 0.5, "step": 391 }, { - "completion_length": 46.770835876464844, - "epoch": 6.272, - "grad_norm": 1.5409730906502774, - "kl": 0.25048828125, + "completion_length": 46.25000190734863, + "epoch": 6.224, + "grad_norm": 2.4315634778894886, + "kl": 0.30615234375, "learning_rate": 5.8720402189419286e-08, "loss": 0.0003, - "reward": 1.1041666865348816, - "reward_std": 0.3584126979112625, - "rewards/correct_code_reward_func": 0.6041666865348816, - "rewards/len_reward_func": 0.5, + "reward": 0.6666666865348816, + "reward_std": 0.2342708334326744, + "rewards/correct_code_reward_func": 0.1875000074505806, + "rewards/len_reward_func": 0.4791666716337204, "step": 392 }, { - "completion_length": 57.583335876464844, - "epoch": 6.288, - "grad_norm": 0.9113319000870377, - "kl": 0.279296875, + "completion_length": 79.85416793823242, + "epoch": 6.24, + "grad_norm": 2.8323358281325715, + "kl": 3.1953125, "learning_rate": 5.768172186574122e-08, - "loss": 0.0003, - "reward": 0.9166666865348816, - "reward_std": 0.2840898931026459, - "rewards/correct_code_reward_func": 0.4166666716337204, - "rewards/len_reward_func": 0.5, + "loss": 0.0032, + "reward": 0.7708333432674408, + "reward_std": 0.28126102685928345, + "rewards/correct_code_reward_func": 0.2916666716337204, + "rewards/len_reward_func": 0.4791666716337204, "step": 393 }, { - "completion_length": 31.979167938232422, - "epoch": 6.304, - "grad_norm": 0.8665463952343896, - "kl": 0.6875, + "completion_length": 26.14583396911621, + "epoch": 6.256, + "grad_norm": 6.919623616583594, + "kl": 1.27734375, "learning_rate": 5.6651110831839046e-08, - "loss": 0.0007, - "reward": 1.0833333730697632, - "reward_std": 0.19500280916690826, - "rewards/correct_code_reward_func": 0.5833333432674408, + "loss": 0.0013, + "reward": 1.2291667461395264, + "reward_std": 0.28126100450754166, + "rewards/correct_code_reward_func": 0.7291666865348816, "rewards/len_reward_func": 0.5, "step": 394 }, { - "completion_length": 52.020835876464844, - "epoch": 6.32, - "grad_norm": 0.7456611150964201, - "kl": 0.36572265625, + "completion_length": 24.5, + "epoch": 6.272, + "grad_norm": 30.75145450155544, + "kl": 1.65966796875, "learning_rate": 5.5628612330087724e-08, - "loss": 0.0004, - "reward": 1.0090517401695251, - "reward_std": 0.3510366529226303, - "rewards/correct_code_reward_func": 0.520833358168602, - "rewards/len_reward_func": 0.48821839690208435, + "loss": 0.0017, + "reward": 1.1250000298023224, + "reward_std": 0.19500282034277916, + "rewards/correct_code_reward_func": 0.6250000298023224, + "rewards/len_reward_func": 0.5, "step": 395 }, { - "completion_length": 56.729169845581055, - "epoch": 6.336, - "grad_norm": 0.6143240154523756, - "kl": 0.400390625, + "completion_length": 44.16666793823242, + "epoch": 6.288, + "grad_norm": 2.646012784054006, + "kl": 0.2724609375, "learning_rate": 5.461426926247639e-08, - "loss": 0.0004, - "reward": 0.7686998248100281, - "reward_std": 0.21476645022630692, - "rewards/correct_code_reward_func": 0.2708333432674408, - "rewards/len_reward_func": 0.4978664815425873, + "loss": 0.0003, + "reward": 0.8958333730697632, + "reward_std": 0.22516431659460068, + "rewards/correct_code_reward_func": 0.395833358168602, + "rewards/len_reward_func": 0.5, "step": 396 }, { - "completion_length": 87.5, - "epoch": 6.352, - "grad_norm": 0.8261937605646208, - "kl": 0.2294921875, + "completion_length": 18.416667938232422, + "epoch": 6.304, + "grad_norm": 1.4581978595368466, + "kl": 0.4716796875, "learning_rate": 5.360812418880883e-08, - "loss": 0.0002, - "reward": 0.8125000298023224, - "reward_std": 0.1767766959965229, - "rewards/correct_code_reward_func": 0.3125000149011612, + "loss": 0.0005, + "reward": 1.1041666865348816, + "reward_std": 0.2041093371808529, + "rewards/correct_code_reward_func": 0.6041666865348816, "rewards/len_reward_func": 0.5, "step": 397 }, { - "completion_length": 51.29166793823242, - "epoch": 6.368, - "grad_norm": 5.2800268965174455, - "kl": 3.046875, + "completion_length": 48.479169845581055, + "epoch": 6.32, + "grad_norm": 152.66305944633206, + "kl": 58.0390625, "learning_rate": 5.261021932491713e-08, - "loss": 0.0031, - "reward": 0.854166716337204, - "reward_std": 0.39486490190029144, - "rewards/correct_code_reward_func": 0.354166679084301, + "loss": 0.0578, + "reward": 0.9166666865348816, + "reward_std": 0.30860670655965805, + "rewards/correct_code_reward_func": 0.4166666865348816, "rewards/len_reward_func": 0.5, "step": 398 }, { - "completion_length": 70.02083587646484, - "epoch": 6.384, - "grad_norm": 0.6432565856269403, - "kl": 0.21337890625, + "completion_length": 32.729166984558105, + "epoch": 6.336, + "grad_norm": 7.835361687841303, + "kl": 0.435546875, "learning_rate": 5.162059654089082e-08, - "loss": 0.0002, - "reward": 0.9985465407371521, - "reward_std": 0.2537791579961777, - "rewards/correct_code_reward_func": 0.5, - "rewards/len_reward_func": 0.4985465258359909, + "loss": 0.0004, + "reward": 0.7291666865348816, + "reward_std": 0.08625819534063339, + "rewards/correct_code_reward_func": 0.2291666716337204, + "rewards/len_reward_func": 0.5, "step": 399 }, { - "completion_length": 41.12500190734863, - "epoch": 6.4, - "grad_norm": 1.6906176432449476, - "kl": 0.2939453125, + "completion_length": 71.20833587646484, + "epoch": 6.352, + "grad_norm": 1.2458790767126353, + "kl": 2.98828125, "learning_rate": 5.0639297359319846e-08, - "loss": 0.0003, - "reward": 1.0833333730697632, - "reward_std": 0.27215447276830673, - "rewards/correct_code_reward_func": 0.5833333730697632, + "loss": 0.003, + "reward": 0.8958333730697632, + "reward_std": 0.30859363824129105, + "rewards/correct_code_reward_func": 0.3958333432674408, "rewards/len_reward_func": 0.5, "step": 400 }, { - "completion_length": 41.10416793823242, - "epoch": 6.416, - "grad_norm": 2.8540588945775758, - "kl": 11.216796875, + "completion_length": 35.10416793823242, + "epoch": 6.368, + "grad_norm": 35.48307958654518, + "kl": 22.49560546875, "learning_rate": 4.9666362953552534e-08, - "loss": 0.0113, - "reward": 1.1666666865348816, - "reward_std": 0.1178511306643486, - "rewards/correct_code_reward_func": 0.6666666865348816, + "loss": 0.0224, + "reward": 0.8958333432674408, + "reward_std": 0.28126100823283195, + "rewards/correct_code_reward_func": 0.39583333395421505, "rewards/len_reward_func": 0.5, "step": 401 }, { - "completion_length": 47.33333396911621, - "epoch": 6.432, - "grad_norm": 1.1110766072601825, - "kl": 0.32373046875, + "completion_length": 33.83333396911621, + "epoch": 6.384, + "grad_norm": 1.9125737436183505, + "kl": 0.8876953125, "learning_rate": 4.870183414596793e-08, - "loss": 0.0003, - "reward": 1.0416666865348816, - "reward_std": 0.3794545978307724, - "rewards/correct_code_reward_func": 0.5625000149011612, - "rewards/len_reward_func": 0.4791666716337204, + "loss": 0.0009, + "reward": 1.0000000298023224, + "reward_std": 0.2342708557844162, + "rewards/correct_code_reward_func": 0.5000000223517418, + "rewards/len_reward_func": 0.5, "step": 402 }, { - "completion_length": 51.64583396911621, - "epoch": 6.448, - "grad_norm": 1.2730076801023489, - "kl": 0.3427734375, + "completion_length": 28.041667938232422, + "epoch": 6.4, + "grad_norm": 1.72406073879926, + "kl": 0.3076171875, "learning_rate": 4.774575140626316e-08, "loss": 0.0003, - "reward": 0.6766856610774994, - "reward_std": 0.2351214364171028, - "rewards/correct_code_reward_func": 0.18750000558793545, - "rewards/len_reward_func": 0.489185631275177, + "reward": 1.2708333730697632, + "reward_std": 0.16340987384319305, + "rewards/correct_code_reward_func": 0.7708333432674408, + "rewards/len_reward_func": 0.5, "step": 403 }, { - "completion_length": 74.0, - "epoch": 6.464, - "grad_norm": 0.7869264398639599, - "kl": 0.2353515625, + "completion_length": 25.5625, + "epoch": 6.416, + "grad_norm": 1.7825264711628441, + "kl": 0.2783203125, "learning_rate": 4.679815484975505e-08, - "loss": 0.0002, - "reward": 0.8958333730697632, - "reward_std": 0.37034809589385986, - "rewards/correct_code_reward_func": 0.4166666865348816, - "rewards/len_reward_func": 0.4791666716337204, + "loss": 0.0003, + "reward": 1.2083333730697632, + "reward_std": 0.22233545035123825, + "rewards/correct_code_reward_func": 0.7083333432674408, + "rewards/len_reward_func": 0.5, "step": 404 }, { - "completion_length": 38.916666984558105, - "epoch": 6.48, - "grad_norm": 0.2826077608435409, - "kl": 0.30712890625, + "completion_length": 24.250000953674316, + "epoch": 6.432, + "grad_norm": 18.631693384669017, + "kl": 3.8271484375, "learning_rate": 4.5859084235697235e-08, - "loss": 0.0003, - "reward": 1.0208333432674408, - "reward_std": 0.0589255653321743, - "rewards/correct_code_reward_func": 0.520833333954215, + "loss": 0.0038, + "reward": 1.2708333730697632, + "reward_std": 0.3584126979112625, + "rewards/correct_code_reward_func": 0.7708333432674408, "rewards/len_reward_func": 0.5, "step": 405 }, { - "completion_length": 59.66666793823242, - "epoch": 6.496, - "grad_norm": 0.9784881454579527, - "kl": 0.97265625, + "completion_length": 34.60416793823242, + "epoch": 6.448, + "grad_norm": 11634.914105010444, + "kl": 4704.2080078125, "learning_rate": 4.492857896561203e-08, - "loss": 0.001, - "reward": 0.875, - "reward_std": 0.3885742127895355, - "rewards/correct_code_reward_func": 0.375, - "rewards/len_reward_func": 0.5, + "loss": 4.6919, + "reward": 0.6458333432674408, + "reward_std": 0.3430154621601105, + "rewards/correct_code_reward_func": 0.1666666679084301, + "rewards/len_reward_func": 0.4791666716337204, "step": 406 }, { - "completion_length": 39.29166793823242, - "epoch": 6.5120000000000005, - "grad_norm": 1.1304528045640831, - "kl": 1.50390625, + "completion_length": 48.47916793823242, + "epoch": 6.464, + "grad_norm": 3.031752468264475, + "kl": 0.5361328125, "learning_rate": 4.4006678081636885e-08, - "loss": 0.0015, - "reward": 0.770833358168602, - "reward_std": 0.2931964062154293, - "rewards/correct_code_reward_func": 0.2916666865348816, - "rewards/len_reward_func": 0.4791666716337204, + "loss": 0.0005, + "reward": 0.8750000298023224, + "reward_std": 0.3493061512708664, + "rewards/correct_code_reward_func": 0.3750000223517418, + "rewards/len_reward_func": 0.5, "step": 407 }, { - "completion_length": 44.75, - "epoch": 6.5280000000000005, - "grad_norm": 0.8162960673748474, - "kl": 0.220703125, + "completion_length": 19.562500953674316, + "epoch": 6.48, + "grad_norm": 6.574838245332442, + "kl": 0.373046875, "learning_rate": 4.309342026488652e-08, - "loss": 0.0002, - "reward": 0.6666666865348816, + "loss": 0.0004, + "reward": 1.1250000298023224, "reward_std": 0.1178511306643486, - "rewards/correct_code_reward_func": 0.16666667722165585, + "rewards/correct_code_reward_func": 0.6250000298023224, "rewards/len_reward_func": 0.5, "step": 408 }, { - "completion_length": 66.45833587646484, - "epoch": 6.5440000000000005, - "grad_norm": 1.1621024552387473, - "kl": 0.20947265625, + "completion_length": 38.375, + "epoch": 6.496, + "grad_norm": 7.824251153117081, + "kl": 0.3798828125, "learning_rate": 4.218884383382987e-08, - "loss": 0.0002, - "reward": 1.1875000596046448, - "reward_std": 0.25249695032835007, - "rewards/correct_code_reward_func": 0.7083333432674408, - "rewards/len_reward_func": 0.4791666716337204, + "loss": 0.0004, + "reward": 1.0416666865348816, + "reward_std": 0.22233545035123825, + "rewards/correct_code_reward_func": 0.5416666716337204, + "rewards/len_reward_func": 0.5, "step": 409 }, { - "completion_length": 55.93750190734863, - "epoch": 6.5600000000000005, - "grad_norm": 1.0567060151289225, - "kl": 1.150390625, + "completion_length": 15.979166984558105, + "epoch": 6.5120000000000005, + "grad_norm": 8.572852559476349, + "kl": 0.2822265625, "learning_rate": 4.1292986742682254e-08, - "loss": 0.0011, - "reward": 0.8133013248443604, - "reward_std": 0.28758829087018967, - "rewards/correct_code_reward_func": 0.3333333432674408, - "rewards/len_reward_func": 0.47996795177459717, + "loss": 0.0003, + "reward": 0.7083333432674408, + "reward_std": 0.07715167850255966, + "rewards/correct_code_reward_func": 0.2083333432674408, + "rewards/len_reward_func": 0.5, "step": 410 }, { - "completion_length": 110.70833587646484, - "epoch": 6.576, - "grad_norm": 0.7247869720705408, - "kl": 0.4423828125, + "completion_length": 27.604167938232422, + "epoch": 6.5280000000000005, + "grad_norm": 1.5853029349225285, + "kl": 0.3447265625, "learning_rate": 4.0405886579813006e-08, - "loss": 0.0004, - "reward": 0.5973389148712158, - "reward_std": 0.2702496573328972, - "rewards/correct_code_reward_func": 0.12500000558793545, - "rewards/len_reward_func": 0.4723389446735382, + "loss": 0.0003, + "reward": 0.7708333730697632, + "reward_std": 0.204109326004982, + "rewards/correct_code_reward_func": 0.2708333395421505, + "rewards/len_reward_func": 0.5, "step": 411 }, { - "completion_length": 79.91666793823242, - "epoch": 6.592, - "grad_norm": 1.076683340641622, - "kl": 0.2041015625, + "completion_length": 20.062500953674316, + "epoch": 6.5440000000000005, + "grad_norm": 5.332547426817105, + "kl": 0.6708984375, "learning_rate": 3.952758056616826e-08, - "loss": 0.0002, - "reward": 0.7023313045501709, - "reward_std": 0.18949268758296967, - "rewards/correct_code_reward_func": 0.2083333432674408, - "rewards/len_reward_func": 0.4939979910850525, + "loss": 0.0007, + "reward": 1.2916666865348816, + "reward_std": 0.352121964097023, + "rewards/correct_code_reward_func": 0.7916666865348816, + "rewards/len_reward_func": 0.5, "step": 412 }, { - "completion_length": 72.02083587646484, - "epoch": 6.608, - "grad_norm": 0.3773136909553423, - "kl": 0.3935546875, + "completion_length": 44.04166793823242, + "epoch": 6.5600000000000005, + "grad_norm": 4.316518711611452, + "kl": 0.43359375, "learning_rate": 3.8658105553709353e-08, "loss": 0.0004, - "reward": 0.4791666716337204, - "reward_std": 0.176776684820652, - "rewards/correct_code_reward_func": 0.02083333395421505, - "rewards/len_reward_func": 0.4583333432674408, + "reward": 0.9375, + "reward_std": 0.28126102685928345, + "rewards/correct_code_reward_func": 0.4375, + "rewards/len_reward_func": 0.5, "step": 413 }, { - "completion_length": 56.85416793823242, - "epoch": 6.624, - "grad_norm": 375.29171284810195, - "kl": 828.138671875, + "completion_length": 79.66666793823242, + "epoch": 6.576, + "grad_norm": 2.813253474512694, + "kl": 0.14697265625, "learning_rate": 3.7797498023866395e-08, - "loss": 0.8302, - "reward": 0.9791666865348816, - "reward_std": 0.24056155234575272, - "rewards/correct_code_reward_func": 0.4791666865348816, + "loss": 0.0001, + "reward": 0.8125000298023224, + "reward_std": 0.3205290511250496, + "rewards/correct_code_reward_func": 0.3125000149011612, "rewards/len_reward_func": 0.5, "step": 414 }, { - "completion_length": 51.97916793823242, - "epoch": 6.64, - "grad_norm": 2.5089065222061544, - "kl": 9.57421875, + "completion_length": 67.625, + "epoch": 6.592, + "grad_norm": 4.588432940227484, + "kl": 0.67236328125, "learning_rate": 3.6945794086007705e-08, - "loss": 0.0096, - "reward": 0.8333333730697632, - "reward_std": 0.24339044094085693, - "rewards/correct_code_reward_func": 0.3541666716337204, - "rewards/len_reward_func": 0.4791666716337204, + "loss": 0.0007, + "reward": 0.7916666865348816, + "reward_std": 0.07715167850255966, + "rewards/correct_code_reward_func": 0.2916666716337204, + "rewards/len_reward_func": 0.5, "step": 415 }, { - "completion_length": 54.14583396911621, - "epoch": 6.656, - "grad_norm": 0.903837921044488, - "kl": 0.482421875, + "completion_length": 46.31250190734863, + "epoch": 6.608, + "grad_norm": 5.238227799827805, + "kl": 1.3623046875, "learning_rate": 3.6103029475924727e-08, - "loss": 0.0005, - "reward": 0.8541666865348816, - "reward_std": 0.16340987384319305, - "rewards/correct_code_reward_func": 0.3541666865348816, - "rewards/len_reward_func": 0.5, + "loss": 0.0014, + "reward": 0.520833358168602, + "reward_std": 0.13607724383473396, + "rewards/correct_code_reward_func": 0.0416666679084301, + "rewards/len_reward_func": 0.4791666716337204, "step": 416 }, { - "completion_length": 55.04166793823242, - "epoch": 6.672, - "grad_norm": 0.4526442828695266, - "kl": 1.1484375, + "completion_length": 36.47916793823242, + "epoch": 6.624, + "grad_norm": 2.438161085387164, + "kl": 0.93359375, "learning_rate": 3.5269239554332556e-08, - "loss": 0.0012, - "reward": 0.7500000298023224, - "reward_std": 0.18292953446507454, - "rewards/correct_code_reward_func": 0.2708333432674408, - "rewards/len_reward_func": 0.4791666716337204, + "loss": 0.0009, + "reward": 0.95250004529953, + "reward_std": 0.16168292984366417, + "rewards/correct_code_reward_func": 0.458333358168602, + "rewards/len_reward_func": 0.49416667222976685, "step": 417 }, { - "completion_length": 47.083335876464844, - "epoch": 6.688, - "grad_norm": 0.8688686439480761, - "kl": 0.29345703125, + "completion_length": 27.77083396911621, + "epoch": 6.64, + "grad_norm": 1.0466352275241257, + "kl": 0.5244140625, "learning_rate": 3.4444459305386504e-08, - "loss": 0.0003, - "reward": 1.0833333432674408, - "reward_std": 0.2994871214032173, - "rewards/correct_code_reward_func": 0.5833333432674408, + "loss": 0.0005, + "reward": 1.0416666865348816, + "reward_std": 0.20693820342421532, + "rewards/correct_code_reward_func": 0.5416666716337204, "rewards/len_reward_func": 0.5, "step": 418 }, { - "completion_length": 50.00000190734863, - "epoch": 6.704, - "grad_norm": 0.47124745120641315, - "kl": 0.3359375, + "completion_length": 29.479167938232422, + "epoch": 6.656, + "grad_norm": 7.370012766463159, + "kl": 0.2734375, "learning_rate": 3.362872333521388e-08, "loss": 0.0003, - "reward": 0.8125000298023224, - "reward_std": 0.23144195973873138, - "rewards/correct_code_reward_func": 0.3125000149011612, + "reward": 0.9166666865348816, + "reward_std": 0.20693820342421532, + "rewards/correct_code_reward_func": 0.4166666716337204, "rewards/len_reward_func": 0.5, "step": 419 }, { - "completion_length": 51.0, - "epoch": 6.72, - "grad_norm": 0.8998754959006781, - "kl": 0.8076171875, + "completion_length": 33.27083396911621, + "epoch": 6.672, + "grad_norm": 2.8559830160043744, + "kl": 0.6416015625, "learning_rate": 3.2822065870462215e-08, - "loss": 0.0008, - "reward": 0.7434967756271362, - "reward_std": 0.1726972535252571, - "rewards/correct_code_reward_func": 0.25, - "rewards/len_reward_func": 0.49349677562713623, + "loss": 0.0006, + "reward": 0.8333333432674408, + "reward_std": 0.19500282034277916, + "rewards/correct_code_reward_func": 0.3333333432674408, + "rewards/len_reward_func": 0.5, "step": 420 }, { - "completion_length": 36.645835876464844, - "epoch": 6.736, - "grad_norm": 1.2268987042575794, - "kl": 0.349609375, + "completion_length": 31.041667938232422, + "epoch": 6.688, + "grad_norm": 20.155745472761566, + "kl": 2.296875, "learning_rate": 3.2024520756863236e-08, - "loss": 0.0004, - "reward": 1.0208333730697632, - "reward_std": 0.3584126681089401, - "rewards/correct_code_reward_func": 0.5208333432674408, + "loss": 0.0023, + "reward": 1.0833333730697632, + "reward_std": 0.4657258689403534, + "rewards/correct_code_reward_func": 0.583333358168602, "rewards/len_reward_func": 0.5, "step": 421 }, { - "completion_length": 79.91666793823242, - "epoch": 6.752, - "grad_norm": 0.9435178914816145, - "kl": 0.580078125, + "completion_length": 32.02083492279053, + "epoch": 6.704, + "grad_norm": 4.317130217269305, + "kl": 0.2734375, "learning_rate": 3.1236121457812545e-08, - "loss": 0.0006, - "reward": 0.64256352186203, - "reward_std": 0.1850866973400116, - "rewards/correct_code_reward_func": 0.1666666716337204, - "rewards/len_reward_func": 0.4758968651294708, + "loss": 0.0003, + "reward": 0.9375, + "reward_std": 0.1480126492679119, + "rewards/correct_code_reward_func": 0.4375, + "rewards/len_reward_func": 0.5, "step": 422 }, { - "completion_length": 67.08333396911621, - "epoch": 6.768, - "grad_norm": 0.4952778609269051, - "kl": 0.689453125, + "completion_length": 33.104166984558105, + "epoch": 6.72, + "grad_norm": 2.4144976734201777, + "kl": 0.26904296875, "learning_rate": 3.045690105296572e-08, - "loss": 0.0007, - "reward": 0.6536935269832611, - "reward_std": 0.15454470738768578, - "rewards/correct_code_reward_func": 0.1875, - "rewards/len_reward_func": 0.4661935269832611, + "loss": 0.0003, + "reward": 0.8541666865348816, + "reward_std": 0.1753452718257904, + "rewards/correct_code_reward_func": 0.3541666716337204, + "rewards/len_reward_func": 0.5, "step": 423 }, { - "completion_length": 54.875, - "epoch": 6.784, - "grad_norm": 0.5136413677717199, - "kl": 0.2109375, + "completion_length": 23.83333396911621, + "epoch": 6.736, + "grad_norm": 4.26144243485857, + "kl": 0.4951171875, "learning_rate": 2.9686892236850336e-08, - "loss": 0.0002, - "reward": 0.6041666865348816, - "reward_std": 0.25392838567495346, - "rewards/correct_code_reward_func": 0.1041666679084301, + "loss": 0.0005, + "reward": 0.8750000298023224, + "reward_std": 0.1451837718486786, + "rewards/correct_code_reward_func": 0.3750000149011612, "rewards/len_reward_func": 0.5, "step": 424 }, { - "completion_length": 44.72916793823242, - "epoch": 6.8, - "grad_norm": 11.265343690585027, - "kl": 3.236328125, + "completion_length": 52.18750286102295, + "epoch": 6.752, + "grad_norm": 6.4098073522343375, + "kl": 0.239501953125, "learning_rate": 2.892612731749414e-08, - "loss": 0.0032, - "reward": 0.7916666865348816, - "reward_std": 0.19500282034277916, - "rewards/correct_code_reward_func": 0.2916666865348816, + "loss": 0.0002, + "reward": 0.7500000298023224, + "reward_std": 0.22233545035123825, + "rewards/correct_code_reward_func": 0.2500000111758709, "rewards/len_reward_func": 0.5, "step": 425 }, { - "completion_length": 84.125, - "epoch": 6.816, - "grad_norm": 0.666539390192808, - "kl": 0.279296875, + "completion_length": 32.3125, + "epoch": 6.768, + "grad_norm": 5.582717299169559, + "kl": 1.6337890625, "learning_rate": 2.817463821506949e-08, - "loss": 0.0003, - "reward": 0.7083333432674408, - "reward_std": 0.1451837718486786, - "rewards/correct_code_reward_func": 0.2083333432674408, + "loss": 0.0016, + "reward": 0.8125, + "reward_std": 0.0589255653321743, + "rewards/correct_code_reward_func": 0.3125, "rewards/len_reward_func": 0.5, "step": 426 }, { - "completion_length": 64.83333587646484, - "epoch": 6.832, - "grad_norm": 0.5655733417160421, - "kl": 0.205078125, + "completion_length": 37.45833396911621, + "epoch": 6.784, + "grad_norm": 9.274067601581377, + "kl": 1.51171875, "learning_rate": 2.7432456460553975e-08, - "loss": 0.0002, - "reward": 0.8388713598251343, - "reward_std": 0.2186070904135704, - "rewards/correct_code_reward_func": 0.354166679084301, - "rewards/len_reward_func": 0.4847046583890915, + "loss": 0.0015, + "reward": 0.6875000298023224, + "reward_std": 0.2931964099407196, + "rewards/correct_code_reward_func": 0.1875000074505806, + "rewards/len_reward_func": 0.5, "step": 427 }, { - "completion_length": 69.8125, - "epoch": 6.848, - "grad_norm": 0.8158828919422655, - "kl": 0.26025390625, + "completion_length": 22.89583396911621, + "epoch": 6.8, + "grad_norm": 2.8256870547748227, + "kl": 1.974609375, "learning_rate": 2.6699613194407723e-08, - "loss": 0.0003, - "reward": 1.0208333432674408, - "reward_std": 0.2903806120157242, - "rewards/correct_code_reward_func": 0.541666679084301, - "rewards/len_reward_func": 0.4791666716337204, + "loss": 0.002, + "reward": 0.8541666865348816, + "reward_std": 0.0589255653321743, + "rewards/correct_code_reward_func": 0.3541666716337204, + "rewards/len_reward_func": 0.5, "step": 428 }, { - "completion_length": 57.68750190734863, - "epoch": 6.864, - "grad_norm": 0.8089968840061872, - "kl": 0.509765625, + "completion_length": 57.895835876464844, + "epoch": 6.816, + "grad_norm": 2.0942452217824026, + "kl": 0.22802734375, "learning_rate": 2.5976139165266364e-08, - "loss": 0.0005, - "reward": 0.8247024118900299, - "reward_std": 0.34987257421016693, - "rewards/correct_code_reward_func": 0.3750000149011612, - "rewards/len_reward_func": 0.4497023820877075, + "loss": 0.0002, + "reward": 0.75, + "reward_std": 0.22233544290065765, + "rewards/correct_code_reward_func": 0.25, + "rewards/len_reward_func": 0.5, "step": 429 }, { - "completion_length": 39.39583396911621, - "epoch": 6.88, - "grad_norm": 1.0217738344460119, - "kl": 0.5576171875, + "completion_length": 41.645835876464844, + "epoch": 6.832, + "grad_norm": 3.326484576610779, + "kl": 0.25634765625, "learning_rate": 2.5262064728651194e-08, - "loss": 0.0006, - "reward": 1.0833333730697632, - "reward_std": 0.2357022576034069, - "rewards/correct_code_reward_func": 0.6041666865348816, - "rewards/len_reward_func": 0.4791666716337204, + "loss": 0.0003, + "reward": 0.9791666865348816, + "reward_std": 0.16340987384319305, + "rewards/correct_code_reward_func": 0.4791666716337204, + "rewards/len_reward_func": 0.5, "step": 430 }, { - "completion_length": 100.5625, - "epoch": 6.896, - "grad_norm": 0.4872455508251309, - "kl": 0.19775390625, + "completion_length": 38.41666793823242, + "epoch": 6.848, + "grad_norm": 0.5971645235217127, + "kl": 0.2880859375, "learning_rate": 2.4557419845695427e-08, - "loss": 0.0002, - "reward": 0.7269951701164246, - "reward_std": 0.09056825935840607, - "rewards/correct_code_reward_func": 0.2291666716337204, - "rewards/len_reward_func": 0.49782851338386536, + "loss": 0.0003, + "reward": 1.1041667461395264, + "reward_std": 0.2041093111038208, + "rewards/correct_code_reward_func": 0.6250000298023224, + "rewards/len_reward_func": 0.4791666716337204, "step": 431 }, { - "completion_length": 53.937503814697266, - "epoch": 6.912, - "grad_norm": 1.0082713173583926, - "kl": 0.2470703125, + "completion_length": 32.20833396911621, + "epoch": 6.864, + "grad_norm": 3.312252032659254, + "kl": 0.583984375, "learning_rate": 2.3862234081887033e-08, - "loss": 0.0002, - "reward": 0.9166666865348816, - "reward_std": 0.2840898856520653, - "rewards/correct_code_reward_func": 0.4166666679084301, + "loss": 0.0006, + "reward": 1.0416666865348816, + "reward_std": 0.3177001625299454, + "rewards/correct_code_reward_func": 0.5416666865348816, "rewards/len_reward_func": 0.5, "step": 432 }, { - "completion_length": 82.89583396911621, - "epoch": 6.928, - "grad_norm": 0.6085631893861287, - "kl": 1.14990234375, + "completion_length": 24.812500953674316, + "epoch": 6.88, + "grad_norm": 1.3331142703147087, + "kl": 0.7646484375, "learning_rate": 2.3176536605828438e-08, - "loss": 0.0012, - "reward": 0.8896019458770752, - "reward_std": 0.21334683522582054, - "rewards/correct_code_reward_func": 0.4166666716337204, - "rewards/len_reward_func": 0.472935289144516, + "loss": 0.0008, + "reward": 1.2083333730697632, + "reward_std": 0.16623875498771667, + "rewards/correct_code_reward_func": 0.7083333432674408, + "rewards/len_reward_func": 0.5, "step": 433 }, { - "completion_length": 44.104166984558105, - "epoch": 6.944, - "grad_norm": 1.0786484721894714, - "kl": 0.3359375, + "completion_length": 88.10416793823242, + "epoch": 6.896, + "grad_norm": 5.916709269663577, + "kl": 1.01611328125, "learning_rate": 2.250035618801241e-08, - "loss": 0.0003, - "reward": 1.0625000298023224, - "reward_std": 0.37034809589385986, - "rewards/correct_code_reward_func": 0.5625000298023224, - "rewards/len_reward_func": 0.5, + "loss": 0.001, + "reward": 0.812254399061203, + "reward_std": 0.13677212223410606, + "rewards/correct_code_reward_func": 0.31250002048909664, + "rewards/len_reward_func": 0.4997543394565582, "step": 434 }, { - "completion_length": 67.93750381469727, - "epoch": 6.96, - "grad_norm": 0.3880109584138995, - "kl": 0.33935546875, + "completion_length": 42.625, + "epoch": 6.912, + "grad_norm": 8.385217980847418, + "kl": 1.11181640625, "learning_rate": 2.183372119961499e-08, - "loss": 0.0003, - "reward": 0.8750000596046448, - "reward_std": 0.16573724150657654, - "rewards/correct_code_reward_func": 0.3958333432674408, - "rewards/len_reward_func": 0.4791666716337204, + "loss": 0.0011, + "reward": 1.041666716337204, + "reward_std": 0.3731769770383835, + "rewards/correct_code_reward_func": 0.5416666865348816, + "rewards/len_reward_func": 0.5, "step": 435 }, { - "completion_length": 56.812503814697266, - "epoch": 6.976, - "grad_norm": 0.9901452341472597, - "kl": 0.33203125, + "completion_length": 56.395835876464844, + "epoch": 6.928, + "grad_norm": 10.216799500012975, + "kl": 0.5478515625, "learning_rate": 2.117665961130513e-08, - "loss": 0.0003, - "reward": 0.7500000298023224, - "reward_std": 0.3233579099178314, - "rewards/correct_code_reward_func": 0.2500000149011612, + "loss": 0.0005, + "reward": 0.9375, + "reward_std": 0.28126100823283195, + "rewards/correct_code_reward_func": 0.4375, "rewards/len_reward_func": 0.5, "step": 436 }, { - "completion_length": 58.187503814697266, - "epoch": 6.992, - "grad_norm": 11.29873817529736, - "kl": 6.416015625, + "completion_length": 24.70833396911621, + "epoch": 6.944, + "grad_norm": 16.1399495650957, + "kl": 0.4716796875, "learning_rate": 2.05291989920712e-08, - "loss": 0.0064, - "reward": 1.0416667461395264, - "reward_std": 0.2721545100212097, - "rewards/correct_code_reward_func": 0.5416666865348816, + "loss": 0.0005, + "reward": 1.1458333730697632, + "reward_std": 0.3142514228820801, + "rewards/correct_code_reward_func": 0.645833358168602, "rewards/len_reward_func": 0.5, "step": 437 }, { - "completion_length": 138.0, - "epoch": 7.0, - "grad_norm": 11.29873817529736, - "kl": 0.185546875, + "completion_length": 31.937501907348633, + "epoch": 6.96, + "grad_norm": 3.526111590197331, + "kl": 0.62890625, "learning_rate": 1.9891366508064e-08, - "loss": 0.0001, - "reward": 0.8688524961471558, - "reward_std": 0.13523900508880615, - "rewards/correct_code_reward_func": 0.375, - "rewards/len_reward_func": 0.4938524663448334, + "loss": 0.0006, + "reward": 1.1041667461395264, + "reward_std": 0.23144195973873138, + "rewards/correct_code_reward_func": 0.6041666865348816, + "rewards/len_reward_func": 0.5, "step": 438 }, { - "completion_length": 52.43750190734863, - "epoch": 7.016, - "grad_norm": 0.42260210721584784, - "kl": 0.13671875, + "completion_length": 30.89583396911621, + "epoch": 6.976, + "grad_norm": 2.7684451481472845, + "kl": 1.005859375, "learning_rate": 1.926318892145712e-08, - "loss": 0.0001, - "reward": 1.0208333432674408, - "reward_std": 0.1767766959965229, - "rewards/correct_code_reward_func": 0.5208333432674408, - "rewards/len_reward_func": 0.5, + "loss": 0.001, + "reward": 0.8120748400688171, + "reward_std": 0.34204548597335815, + "rewards/correct_code_reward_func": 0.3125, + "rewards/len_reward_func": 0.49957482516765594, "step": 439 }, { - "completion_length": 37.54166793823242, - "epoch": 7.032, - "grad_norm": 4.960698796552231, - "kl": 5.15625, + "completion_length": 28.625000953674316, + "epoch": 6.992, + "grad_norm": 10.488372974591355, + "kl": 3.521484375, "learning_rate": 1.8644692589323967e-08, - "loss": 0.0052, - "reward": 0.9583333432674408, - "reward_std": 0.2357022576034069, - "rewards/correct_code_reward_func": 0.4791666716337204, + "loss": 0.0035, + "reward": 1.2291666865348816, + "reward_std": 0.37177951633930206, + "rewards/correct_code_reward_func": 0.7500000298023224, "rewards/len_reward_func": 0.4791666716337204, "step": 440 }, { - "completion_length": 33.64583492279053, - "epoch": 7.048, - "grad_norm": 2.6085116118275984, - "kl": 10.71875, + "completion_length": 50.79166793823242, + "epoch": 7.0, + "grad_norm": 10.488372974591355, + "kl": 0.859375, "learning_rate": 1.803590346253195e-08, - "loss": 0.0107, + "loss": 0.0004, "reward": 0.875, - "reward_std": 0.16623876243829727, + "reward_std": 0.1178511306643486, "rewards/correct_code_reward_func": 0.375, "rewards/len_reward_func": 0.5, "step": 441 }, { - "completion_length": 89.0, - "epoch": 7.064, - "grad_norm": 1.859877923727488, - "kl": 0.63623046875, + "completion_length": 36.10416793823242, + "epoch": 7.016, + "grad_norm": 14.864715045495563, + "kl": 0.283203125, "learning_rate": 1.7436847084653456e-08, - "loss": 0.0006, - "reward": 0.5596473067998886, - "reward_std": 0.09432684071362019, - "rewards/correct_code_reward_func": 0.0625, - "rewards/len_reward_func": 0.4971473067998886, + "loss": 0.0003, + "reward": 1.0625, + "reward_std": 0.2041093371808529, + "rewards/correct_code_reward_func": 0.5625, + "rewards/len_reward_func": 0.5, "step": 442 }, { - "completion_length": 51.5625, - "epoch": 7.08, - "grad_norm": 0.5005441787583624, - "kl": 0.373046875, + "completion_length": 24.4375, + "epoch": 7.032, + "grad_norm": 0.9322362358127333, + "kl": 1.41796875, "learning_rate": 1.6847548590894434e-08, - "loss": 0.0004, - "reward": 0.7394323945045471, - "reward_std": 0.30204425752162933, - "rewards/correct_code_reward_func": 0.2500000074505806, - "rewards/len_reward_func": 0.4894323796033859, + "loss": 0.0014, + "reward": 1.0000000596046448, + "reward_std": 0.0, + "rewards/correct_code_reward_func": 0.5000000149011612, + "rewards/len_reward_func": 0.5, "step": 443 }, { - "completion_length": 76.58333587646484, - "epoch": 7.096, - "grad_norm": 0.9287668036145749, - "kl": 8.8232421875, - "learning_rate": 1.626803270703936e-08, - "loss": 0.0089, - "reward": 0.5625000298023224, - "reward_std": 0.22516432031989098, - "rewards/correct_code_reward_func": 0.08333333395421505, - "rewards/len_reward_func": 0.4791666716337204, + "completion_length": 20.229166984558105, + "epoch": 7.048, + "grad_norm": 8.65798400301399, + "kl": 0.55859375, + "learning_rate": 1.626803270703936e-08, + "loss": 0.0006, + "reward": 0.9583333730697632, + "reward_std": 0.07715167850255966, + "rewards/correct_code_reward_func": 0.458333358168602, + "rewards/len_reward_func": 0.5, "step": 444 }, { - "completion_length": 54.375, - "epoch": 7.112, - "grad_norm": 2.3736535380234596, - "kl": 2.140625, + "completion_length": 61.33333396911621, + "epoch": 7.064, + "grad_norm": 0.38359849574872695, + "kl": 0.626953125, "learning_rate": 1.5698323748414122e-08, - "loss": 0.0021, - "reward": 0.7083333432674408, - "reward_std": 0.2616034671664238, - "rewards/correct_code_reward_func": 0.2083333432674408, + "loss": 0.0006, + "reward": 0.6250000298023224, + "reward_std": 0.22233545035123825, + "rewards/correct_code_reward_func": 0.1250000037252903, "rewards/len_reward_func": 0.5, "step": 445 }, { - "completion_length": 48.16666793823242, - "epoch": 7.128, - "grad_norm": 0.26492340759427296, - "kl": 2.6064453125, + "completion_length": 28.5625, + "epoch": 7.08, + "grad_norm": 1.1673173612412489, + "kl": 0.90625, "learning_rate": 1.513844561886554e-08, - "loss": 0.0026, - "reward": 0.8541666865348816, - "reward_std": 0.16340987384319305, - "rewards/correct_code_reward_func": 0.3541666865348816, + "loss": 0.0009, + "reward": 1.0416666865348816, + "reward_std": 0.2069382146000862, + "rewards/correct_code_reward_func": 0.5416666865348816, "rewards/len_reward_func": 0.5, "step": 446 }, { - "completion_length": 55.43750190734863, - "epoch": 7.144, - "grad_norm": 2.9991534421290407, - "kl": 1.095703125, + "completion_length": 61.833335876464844, + "epoch": 7.096, + "grad_norm": 16.04386512848503, + "kl": 0.255859375, "learning_rate": 1.4588421809758639e-08, - "loss": 0.0011, - "reward": 0.9375000298023224, - "reward_std": 0.25199542567133904, - "rewards/correct_code_reward_func": 0.4583333432674408, - "rewards/len_reward_func": 0.4791666716337204, + "loss": 0.0003, + "reward": 0.8125000298023224, + "reward_std": 0.16340987384319305, + "rewards/correct_code_reward_func": 0.3125000149011612, + "rewards/len_reward_func": 0.5, "step": 447 }, { - "completion_length": 60.354169845581055, - "epoch": 7.16, - "grad_norm": 1.6852652762873734, - "kl": 5.42578125, + "completion_length": 36.75, + "epoch": 7.112, + "grad_norm": 2.1279061887438573, + "kl": 0.28515625, "learning_rate": 1.4048275398990894e-08, - "loss": 0.0054, - "reward": 0.75, - "reward_std": 0.08908708393573761, - "rewards/correct_code_reward_func": 0.25, + "loss": 0.0003, + "reward": 0.7916666865348816, + "reward_std": 0.2553258389234543, + "rewards/correct_code_reward_func": 0.291666679084301, "rewards/len_reward_func": 0.5, "step": 448 }, { - "completion_length": 66.70833587646484, - "epoch": 7.176, - "grad_norm": 0.038892328040448086, - "kl": 0.13330078125, + "completion_length": 27.229166984558105, + "epoch": 7.128, + "grad_norm": 1.2991397133477103, + "kl": 0.34765625, "learning_rate": 1.351802905002386e-08, - "loss": 0.0001, - "reward": 0.7500000298023224, - "reward_std": 0.08908708393573761, - "rewards/correct_code_reward_func": 0.2500000074505806, + "loss": 0.0003, + "reward": 0.875, + "reward_std": 0.07715167850255966, + "rewards/correct_code_reward_func": 0.375, "rewards/len_reward_func": 0.5, "step": 449 }, { - "completion_length": 75.4375, - "epoch": 7.192, - "grad_norm": 0.5237816703450692, - "kl": 0.25048828125, + "completion_length": 33.83333396911621, + "epoch": 7.144, + "grad_norm": 0.8457590693731686, + "kl": 0.521484375, "learning_rate": 1.2997705010932391e-08, - "loss": 0.0002, - "reward": 0.979166716337204, - "reward_std": 0.28126102685928345, - "rewards/correct_code_reward_func": 0.5000000298023224, - "rewards/len_reward_func": 0.4791666716337204, + "loss": 0.0005, + "reward": 0.9375000596046448, + "reward_std": 0.24056154489517212, + "rewards/correct_code_reward_func": 0.4375000149011612, + "rewards/len_reward_func": 0.5, "step": 450 }, { - "completion_length": 71.72916793823242, - "epoch": 7.208, - "grad_norm": 0.5803732522850783, - "kl": 0.421875, + "completion_length": 48.95833396911621, + "epoch": 7.16, + "grad_norm": 43.218649853018356, + "kl": 9.37255859375, "learning_rate": 1.248732511347103e-08, - "loss": 0.0004, - "reward": 0.6041666865348816, - "reward_std": 0.2041093111038208, - "rewards/correct_code_reward_func": 0.125, - "rewards/len_reward_func": 0.4791666716337204, + "loss": 0.0093, + "reward": 0.7708333730697632, + "reward_std": 0.22516431659460068, + "rewards/correct_code_reward_func": 0.2708333395421505, + "rewards/len_reward_func": 0.5, "step": 451 }, { - "completion_length": 42.04166793823242, - "epoch": 7.224, - "grad_norm": 0.5184725145288659, - "kl": 0.21435546875, + "completion_length": 38.56250190734863, + "epoch": 7.176, + "grad_norm": 1.3972703569580567, + "kl": 0.703125, "learning_rate": 1.1986910772158105e-08, - "loss": 0.0002, - "reward": 1.1666666865348816, - "reward_std": 0.19500280916690826, - "rewards/correct_code_reward_func": 0.6666666865348816, + "loss": 0.0007, + "reward": 1.0000000596046448, + "reward_std": 0.2342708334326744, + "rewards/correct_code_reward_func": 0.5000000149011612, "rewards/len_reward_func": 0.5, "step": 452 }, { - "completion_length": 56.22916793823242, - "epoch": 7.24, - "grad_norm": 0.748753589898722, - "kl": 3.5458984375, + "completion_length": 51.95833396911621, + "epoch": 7.192, + "grad_norm": 1.115685314374603, + "kl": 0.72900390625, "learning_rate": 1.1496482983377188e-08, - "loss": 0.0036, - "reward": 0.7708333432674408, - "reward_std": 0.23144196718931198, - "rewards/correct_code_reward_func": 0.2916666716337204, - "rewards/len_reward_func": 0.4791666716337204, + "loss": 0.0007, + "reward": 1.0236400961875916, + "reward_std": 0.27332228422164917, + "rewards/correct_code_reward_func": 0.5416666865348816, + "rewards/len_reward_func": 0.48197343945503235, "step": 453 }, { - "completion_length": 51.145835876464844, - "epoch": 7.256, - "grad_norm": 0.8694366869498387, - "kl": 0.318359375, + "completion_length": 42.41666793823242, + "epoch": 7.208, + "grad_norm": 8.181481838866919, + "kl": 1.8310546875, "learning_rate": 1.1016062324496007e-08, - "loss": 0.0003, - "reward": 1.0625, - "reward_std": 0.1480126492679119, - "rewards/correct_code_reward_func": 0.5625, + "loss": 0.0018, + "reward": 0.7291666865348816, + "reward_std": 0.28126102685928345, + "rewards/correct_code_reward_func": 0.229166679084301, "rewards/len_reward_func": 0.5, "step": 454 }, { - "completion_length": 56.625, - "epoch": 7.272, - "grad_norm": 0.702230593022222, - "kl": 0.21240234375, + "completion_length": 31.687500953674316, + "epoch": 7.224, + "grad_norm": 3.5471143654468107, + "kl": 0.75, "learning_rate": 1.054566895300324e-08, - "loss": 0.0002, - "reward": 1.0208333730697632, - "reward_std": 0.293196402490139, - "rewards/correct_code_reward_func": 0.5208333432674408, + "loss": 0.0008, + "reward": 1.1875, + "reward_std": 0.13607724383473396, + "rewards/correct_code_reward_func": 0.6875, "rewards/len_reward_func": 0.5, "step": 455 }, { - "completion_length": 66.06250381469727, - "epoch": 7.288, - "grad_norm": 0.9464997647511368, - "kl": 0.251953125, + "completion_length": 31.750001907348633, + "epoch": 7.24, + "grad_norm": 3.358209387145501, + "kl": 0.62060546875, "learning_rate": 1.0085322605662666e-08, - "loss": 0.0003, - "reward": 0.875, - "reward_std": 0.34018656611442566, - "rewards/correct_code_reward_func": 0.3958333432674408, - "rewards/len_reward_func": 0.4791666716337204, + "loss": 0.0006, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/correct_code_reward_func": 0.1666666716337204, + "rewards/len_reward_func": 0.5, "step": 456 }, { - "completion_length": 74.52083587646484, - "epoch": 7.304, - "grad_norm": 0.7564146282258873, - "kl": 0.375, + "completion_length": 36.50000190734863, + "epoch": 7.256, + "grad_norm": 1.3699979933630457, + "kl": 0.3955078125, "learning_rate": 9.635042597685023e-09, "loss": 0.0004, - "reward": 0.7500000298023224, - "reward_std": 0.3247893303632736, - "rewards/correct_code_reward_func": 0.2708333395421505, - "rewards/len_reward_func": 0.4791666716337204, + "reward": 1.0208333730697632, + "reward_std": 0.23144196718931198, + "rewards/correct_code_reward_func": 0.5208333432674408, + "rewards/len_reward_func": 0.5, "step": 457 }, { - "completion_length": 39.68750190734863, - "epoch": 7.32, - "grad_norm": 1.2239732790190176, - "kl": 7.734375, + "completion_length": 45.4375, + "epoch": 7.272, + "grad_norm": 12.90766174002275, + "kl": 0.28125, "learning_rate": 9.194847821917623e-09, - "loss": 0.0077, - "reward": 0.6666666716337204, - "reward_std": 0.1178511306643486, - "rewards/correct_code_reward_func": 0.1875, - "rewards/len_reward_func": 0.4791666716337204, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.2994871288537979, + "rewards/correct_code_reward_func": 0.5, + "rewards/len_reward_func": 0.5, "step": 458 }, { - "completion_length": 43.77083396911621, - "epoch": 7.336, - "grad_norm": 1.0556059045351955, - "kl": 2.236328125, + "completion_length": 42.291666984558105, + "epoch": 7.288, + "grad_norm": 1.7345212956082203, + "kl": 0.2353515625, "learning_rate": 8.764756748051661e-09, - "loss": 0.0022, - "reward": 0.9615983366966248, - "reward_std": 0.12054916843771935, - "rewards/correct_code_reward_func": 0.520833358168602, - "rewards/len_reward_func": 0.4407649636268616, + "loss": 0.0002, + "reward": 0.979166716337204, + "reward_std": 0.23709972202777863, + "rewards/correct_code_reward_func": 0.479166679084301, + "rewards/len_reward_func": 0.5, "step": 459 }, { - "completion_length": 37.77083396911621, - "epoch": 7.352, - "grad_norm": 1.6276321872616368, - "kl": 2.4609375, + "completion_length": 44.64583396911621, + "epoch": 7.304, + "grad_norm": 0.8074298440188371, + "kl": 0.59375, "learning_rate": 8.344787421847216e-09, - "loss": 0.0025, - "reward": 0.9375000298023224, - "reward_std": 0.37662573158741, - "rewards/correct_code_reward_func": 0.4583333432674408, - "rewards/len_reward_func": 0.4791666716337204, + "loss": 0.0006, + "reward": 0.9375000596046448, + "reward_std": 0.08625819534063339, + "rewards/correct_code_reward_func": 0.4375000149011612, + "rewards/len_reward_func": 0.5, "step": 460 }, { - "completion_length": 52.020835876464844, - "epoch": 7.368, - "grad_norm": 1.186981092719814, - "kl": 0.255859375, + "completion_length": 31.375000953674316, + "epoch": 7.32, + "grad_norm": 1.0606388187901021, + "kl": 0.435546875, "learning_rate": 7.934957464376058e-09, - "loss": 0.0003, - "reward": 0.9375000596046448, - "reward_std": 0.24056155234575272, - "rewards/correct_code_reward_func": 0.4375000149011612, + "loss": 0.0004, + "reward": 0.6458333432674408, + "reward_std": 0.0589255653321743, + "rewards/correct_code_reward_func": 0.1458333432674408, "rewards/len_reward_func": 0.5, "step": 461 }, { - "completion_length": 67.20833396911621, - "epoch": 7.384, - "grad_norm": 0.8382586656398067, - "kl": 2.98828125, + "completion_length": 25.000000953674316, + "epoch": 7.336, + "grad_norm": 1.0647975039517257, + "kl": 0.4501953125, "learning_rate": 7.535284071282455e-09, - "loss": 0.003, - "reward": 0.8958333432674408, - "reward_std": 0.3794676810503006, - "rewards/correct_code_reward_func": 0.3958333432674408, + "loss": 0.0005, + "reward": 1.0416666865348816, + "reward_std": 0.1451837718486786, + "rewards/correct_code_reward_func": 0.5416666716337204, "rewards/len_reward_func": 0.5, "step": 462 }, { - "completion_length": 56.958335876464844, - "epoch": 7.4, - "grad_norm": 0.9303999876294363, - "kl": 1.94140625, + "completion_length": 21.750000953674316, + "epoch": 7.352, + "grad_norm": 0.5043304022456115, + "kl": 0.34375, "learning_rate": 7.145784012061423e-09, - "loss": 0.002, - "reward": 0.6666666865348816, - "reward_std": 0.1178511306643486, - "rewards/correct_code_reward_func": 0.1666666716337204, + "loss": 0.0003, + "reward": 0.9583333730697632, + "reward_std": 0.1451837718486786, + "rewards/correct_code_reward_func": 0.4583333432674408, "rewards/len_reward_func": 0.5, "step": 463 }, { - "completion_length": 67.04166793823242, - "epoch": 7.416, - "grad_norm": 0.840509368946007, - "kl": 0.16650390625, + "completion_length": 40.75, + "epoch": 7.368, + "grad_norm": 68.80649583724717, + "kl": 54.8515625, "learning_rate": 6.766473629355452e-09, - "loss": 0.0002, - "reward": 0.5841049551963806, - "reward_std": 0.19282037019729614, - "rewards/correct_code_reward_func": 0.125, - "rewards/len_reward_func": 0.4591049402952194, + "loss": 0.0548, + "reward": 0.9166666865348816, + "reward_std": 0.2840898931026459, + "rewards/correct_code_reward_func": 0.4166666716337204, + "rewards/len_reward_func": 0.5, "step": 464 }, { - "completion_length": 66.39583396911621, - "epoch": 7.432, - "grad_norm": 0.025722166274313427, - "kl": 0.328125, + "completion_length": 53.83333396911621, + "epoch": 7.384, + "grad_norm": 6.320430427612535, + "kl": 0.2294921875, "learning_rate": 6.397368838268496e-09, - "loss": 0.0003, - "reward": 0.5833333432674408, - "reward_std": 0.08908708393573761, - "rewards/correct_code_reward_func": 0.0833333358168602, + "loss": 0.0002, + "reward": 0.9583333730697632, + "reward_std": 0.4446708858013153, + "rewards/correct_code_reward_func": 0.4583333432674408, "rewards/len_reward_func": 0.5, "step": 465 }, { - "completion_length": 57.645835876464844, - "epoch": 7.448, - "grad_norm": 0.6577307584519412, - "kl": 0.3203125, + "completion_length": 53.56250190734863, + "epoch": 7.4, + "grad_norm": 1.270276850939279, + "kl": 0.21435546875, "learning_rate": 6.038485125698295e-09, - "loss": 0.0003, - "reward": 0.7435699999332428, - "reward_std": 0.27804213389754295, - "rewards/correct_code_reward_func": 0.2500000149011612, - "rewards/len_reward_func": 0.4935699701309204, + "loss": 0.0002, + "reward": 0.6875000298023224, + "reward_std": 0.1767766959965229, + "rewards/correct_code_reward_func": 0.1875000111758709, + "rewards/len_reward_func": 0.5, "step": 466 }, { - "completion_length": 48.66666793823242, - "epoch": 7.464, - "grad_norm": 0.9138293929656933, - "kl": 3.8828125, + "completion_length": 48.18750190734863, + "epoch": 7.416, + "grad_norm": 5.003487718170664, + "kl": 1.1865234375, "learning_rate": 5.689837549686744e-09, - "loss": 0.0039, - "reward": 0.8750000298023224, - "reward_std": 0.2630348764359951, - "rewards/correct_code_reward_func": 0.3750000149011612, + "loss": 0.0012, + "reward": 0.8333333432674408, + "reward_std": 0.27215447276830673, + "rewards/correct_code_reward_func": 0.3333333432674408, "rewards/len_reward_func": 0.5, "step": 467 }, { - "completion_length": 59.062503814697266, - "epoch": 7.48, - "grad_norm": 0.7099051490378047, - "kl": 0.8779296875, + "completion_length": 39.583335876464844, + "epoch": 7.432, + "grad_norm": 2.735415228101247, + "kl": 0.3740234375, "learning_rate": 5.3514407387877936e-09, - "loss": 0.0009, - "reward": 0.6666666865348816, - "reward_std": 0.17817416787147522, - "rewards/correct_code_reward_func": 0.1666666716337204, + "loss": 0.0004, + "reward": 0.5833333432674408, + "reward_std": 0.1451837718486786, + "rewards/correct_code_reward_func": 0.0833333358168602, "rewards/len_reward_func": 0.5, "step": 468 }, { - "completion_length": 54.91666793823242, - "epoch": 7.496, - "grad_norm": 0.45864083120283305, - "kl": 0.118896484375, + "completion_length": 37.31250190734863, + "epoch": 7.448, + "grad_norm": 11.016089026286402, + "kl": 0.24560546875, "learning_rate": 5.023308891453915e-09, - "loss": 0.0001, - "reward": 1.1250000596046448, - "reward_std": 0.2553258463740349, - "rewards/correct_code_reward_func": 0.6250000298023224, + "loss": 0.0002, + "reward": 0.8750000298023224, + "reward_std": 0.2840898856520653, + "rewards/correct_code_reward_func": 0.3750000149011612, "rewards/len_reward_func": 0.5, "step": 469 }, { - "completion_length": 45.08333396911621, - "epoch": 7.5120000000000005, - "grad_norm": 4.937237802753599, - "kl": 17.046875, + "completion_length": 25.89583396911621, + "epoch": 7.464, + "grad_norm": 3.2930129644698196, + "kl": 0.3974609375, "learning_rate": 4.705455775440237e-09, - "loss": 0.0171, - "reward": 1.104166716337204, - "reward_std": 0.4355643689632416, - "rewards/correct_code_reward_func": 0.6041666865348816, + "loss": 0.0004, + "reward": 0.9791666865348816, + "reward_std": 0.13607724383473396, + "rewards/correct_code_reward_func": 0.4791666865348816, "rewards/len_reward_func": 0.5, "step": 470 }, { - "completion_length": 44.27083396911621, - "epoch": 7.5280000000000005, - "grad_norm": 7.29874215241321, - "kl": 0.36279296875, + "completion_length": 37.70833396911621, + "epoch": 7.48, + "grad_norm": 2.9194234148211318, + "kl": 3.8671875, "learning_rate": 4.3978947272269305e-09, - "loss": 0.0004, - "reward": 0.6250000149011612, - "reward_std": 0.1178511306643486, + "loss": 0.0039, + "reward": 0.6458333432674408, + "reward_std": 0.1753452718257904, "rewards/correct_code_reward_func": 0.1458333432674408, - "rewards/len_reward_func": 0.4791666716337204, + "rewards/len_reward_func": 0.5, "step": 471 }, { - "completion_length": 53.37500190734863, - "epoch": 7.5440000000000005, - "grad_norm": 0.8821066647564397, - "kl": 0.98828125, + "completion_length": 36.16666793823242, + "epoch": 7.496, + "grad_norm": 6.793516750616495, + "kl": 12.18359375, "learning_rate": 4.100638651459542e-09, - "loss": 0.001, - "reward": 0.9166666865348816, - "reward_std": 0.3794546127319336, - "rewards/correct_code_reward_func": 0.4166666865348816, - "rewards/len_reward_func": 0.5, + "loss": 0.0122, + "reward": 1.0625000596046448, + "reward_std": 0.39768069982528687, + "rewards/correct_code_reward_func": 0.5833333432674408, + "rewards/len_reward_func": 0.4791666716337204, "step": 472 }, { - "completion_length": 51.56250190734863, - "epoch": 7.5600000000000005, - "grad_norm": 0.9015227034769271, - "kl": 7.67333984375, + "completion_length": 33.33333492279053, + "epoch": 7.5120000000000005, + "grad_norm": 3.5785167861657876, + "kl": 0.4638671875, "learning_rate": 3.813700020407706e-09, - "loss": 0.0077, - "reward": 1.1458333730697632, - "reward_std": 0.13607724383473396, - "rewards/correct_code_reward_func": 0.6458333730697632, + "loss": 0.0005, + "reward": 1.2500000596046448, + "reward_std": 0.22233543917536736, + "rewards/correct_code_reward_func": 0.7500000298023224, "rewards/len_reward_func": 0.5, "step": 473 }, { - "completion_length": 47.520835876464844, - "epoch": 7.576, - "grad_norm": 2.068571985751675, - "kl": 0.515625, + "completion_length": 27.229167938232422, + "epoch": 7.5280000000000005, + "grad_norm": 10.521524045166144, + "kl": 4.5869140625, "learning_rate": 3.5370908734417006e-09, - "loss": 0.0005, - "reward": 0.7916666865348816, - "reward_std": 0.20693820714950562, - "rewards/correct_code_reward_func": 0.2916666716337204, + "loss": 0.0046, + "reward": 0.6041666865348816, + "reward_std": 0.08625819534063339, + "rewards/correct_code_reward_func": 0.1041666716337204, "rewards/len_reward_func": 0.5, "step": 474 }, { - "completion_length": 76.27083587646484, - "epoch": 7.592, - "grad_norm": 0.5263954461256287, - "kl": 0.19677734375, + "completion_length": 35.45833396911621, + "epoch": 7.5440000000000005, + "grad_norm": 3.8203641546567724, + "kl": 3.22265625, "learning_rate": 3.2708228165273244e-09, - "loss": 0.0002, - "reward": 0.9898040592670441, - "reward_std": 0.204023327678442, - "rewards/correct_code_reward_func": 0.5625000223517418, - "rewards/len_reward_func": 0.4273040294647217, + "loss": 0.0032, + "reward": 0.9583333432674408, + "reward_std": 0.2903675436973572, + "rewards/correct_code_reward_func": 0.4583333358168602, + "rewards/len_reward_func": 0.5, "step": 475 }, { - "completion_length": 52.04166793823242, - "epoch": 7.608, - "grad_norm": 0.5681659680222745, - "kl": 0.310546875, + "completion_length": 30.14583396911621, + "epoch": 7.5600000000000005, + "grad_norm": 2.1270667018702687, + "kl": 0.291015625, "learning_rate": 3.0149070217390106e-09, "loss": 0.0003, - "reward": 0.6458333730697632, - "reward_std": 0.22516431659460068, - "rewards/correct_code_reward_func": 0.1458333395421505, + "reward": 1.3333333730697632, + "reward_std": 0.1178511306643486, + "rewards/correct_code_reward_func": 0.8333333730697632, "rewards/len_reward_func": 0.5, "step": 476 }, { - "completion_length": 73.60416793823242, - "epoch": 7.624, - "grad_norm": 0.6028375603980457, - "kl": 0.2900390625, + "completion_length": 30.27083396911621, + "epoch": 7.576, + "grad_norm": 3.9308692428339946, + "kl": 0.375, "learning_rate": 2.769354226790893e-09, - "loss": 0.0003, - "reward": 0.7500000298023224, - "reward_std": 0.22233543917536736, - "rewards/correct_code_reward_func": 0.25000000558793545, + "loss": 0.0004, + "reward": 0.7708333730697632, + "reward_std": 0.21322892606258392, + "rewards/correct_code_reward_func": 0.2708333432674408, "rewards/len_reward_func": 0.5, "step": 477 }, { - "completion_length": 61.562503814697266, - "epoch": 7.64, - "grad_norm": 0.5808381625924309, - "kl": 0.4326171875, + "completion_length": 55.45833396911621, + "epoch": 7.592, + "grad_norm": 2.256041899218313, + "kl": 0.3427734375, "learning_rate": 2.5341747345865026e-09, - "loss": 0.0004, - "reward": 0.9751572608947754, - "reward_std": 0.37299972772598267, - "rewards/correct_code_reward_func": 0.4791666716337204, - "rewards/len_reward_func": 0.4959905594587326, + "loss": 0.0003, + "reward": 1.0411083102226257, + "reward_std": 0.20673340186476707, + "rewards/correct_code_reward_func": 0.5625000223517418, + "rewards/len_reward_func": 0.47860829532146454, "step": 478 }, { - "completion_length": 84.64583587646484, - "epoch": 7.656, - "grad_norm": 0.6300673976043227, - "kl": 0.1806640625, + "completion_length": 34.5625, + "epoch": 7.608, + "grad_norm": 4.963557661177748, + "kl": 1.2607421875, "learning_rate": 2.3093784127863057e-09, - "loss": 0.0002, - "reward": 0.8736242055892944, - "reward_std": 0.23959357291460037, - "rewards/correct_code_reward_func": 0.3958333432674408, - "rewards/len_reward_func": 0.47779087722301483, + "loss": 0.0013, + "reward": 0.8541666865348816, + "reward_std": 0.30859363824129105, + "rewards/correct_code_reward_func": 0.3541666679084301, + "rewards/len_reward_func": 0.5, "step": 479 }, { - "completion_length": 92.91667175292969, - "epoch": 7.672, - "grad_norm": 0.5651987723508025, - "kl": 0.404296875, + "completion_length": 43.395835876464844, + "epoch": 7.624, + "grad_norm": 0.9036937312542045, + "kl": 2.1318359375, "learning_rate": 2.094974693393731e-09, - "loss": 0.0004, - "reward": 0.8541666865348816, - "reward_std": 0.28126098960638046, - "rewards/correct_code_reward_func": 0.3541666716337204, + "loss": 0.0021, + "reward": 0.75, + "reward_std": 0.08908708393573761, + "rewards/correct_code_reward_func": 0.25, "rewards/len_reward_func": 0.5, "step": 480 }, { - "completion_length": 85.39583587646484, - "epoch": 7.688, - "grad_norm": 0.6295561710360636, - "kl": 0.212890625, + "completion_length": 42.18750190734863, + "epoch": 7.64, + "grad_norm": 7.599576735747625, + "kl": 0.255859375, "learning_rate": 1.890972572359456e-09, - "loss": 0.0002, - "reward": 0.9465548694133759, - "reward_std": 0.23564931005239487, - "rewards/correct_code_reward_func": 0.4583333432674408, - "rewards/len_reward_func": 0.4882214665412903, + "loss": 0.0003, + "reward": 1.1041666865348816, + "reward_std": 0.320529043674469, + "rewards/correct_code_reward_func": 0.6041666865348816, + "rewards/len_reward_func": 0.5, "step": 481 }, { - "completion_length": 47.56250190734863, - "epoch": 7.704, - "grad_norm": 0.7929325837654847, - "kl": 0.26806640625, + "completion_length": 41.062500953674316, + "epoch": 7.656, + "grad_norm": 7.576066045438129, + "kl": 0.95947265625, "learning_rate": 1.6973806092038523e-09, - "loss": 0.0003, - "reward": 1.0208333730697632, - "reward_std": 0.24056155234575272, - "rewards/correct_code_reward_func": 0.520833358168602, + "loss": 0.001, + "reward": 0.8958333432674408, + "reward_std": 0.22516432031989098, + "rewards/correct_code_reward_func": 0.3958333432674408, "rewards/len_reward_func": 0.5, "step": 482 }, { - "completion_length": 100.66666984558105, - "epoch": 7.72, - "grad_norm": 1.157303815664519, - "kl": 2.140625, + "completion_length": 66.18750381469727, + "epoch": 7.672, + "grad_norm": 2.1174824529271934, + "kl": 0.44189453125, "learning_rate": 1.514206926658046e-09, - "loss": 0.0021, - "reward": 0.75, - "reward_std": 0.34018657356500626, - "rewards/correct_code_reward_func": 0.29166667722165585, - "rewards/len_reward_func": 0.4583333432674408, + "loss": 0.0004, + "reward": 0.828984946012497, + "reward_std": 0.19452743232250214, + "rewards/correct_code_reward_func": 0.3541666865348816, + "rewards/len_reward_func": 0.47481827437877655, "step": 483 }, { - "completion_length": 43.166666984558105, - "epoch": 7.736, - "grad_norm": 2.1077527041992665, - "kl": 0.484375, + "completion_length": 32.83333396911621, + "epoch": 7.688, + "grad_norm": 5.163246199946358, + "kl": 3.888671875, "learning_rate": 1.3414592103228594e-09, - "loss": 0.0005, - "reward": 0.7291666865348816, - "reward_std": 0.33592625707387924, - "rewards/correct_code_reward_func": 0.2291666679084301, + "loss": 0.0039, + "reward": 1.0833333730697632, + "reward_std": 0.19500282034277916, + "rewards/correct_code_reward_func": 0.5833333432674408, "rewards/len_reward_func": 0.5, "step": 484 }, { - "completion_length": 49.0625, - "epoch": 7.752, - "grad_norm": 0.9693846669459211, - "kl": 14.25, + "completion_length": 52.87500286102295, + "epoch": 7.704, + "grad_norm": 2.809107387786595, + "kl": 4.5390625, "learning_rate": 1.1791447083465133e-09, - "loss": 0.0143, - "reward": 0.8865291476249695, - "reward_std": 0.26687781512737274, - "rewards/correct_code_reward_func": 0.3958333432674408, - "rewards/len_reward_func": 0.4906958043575287, + "loss": 0.0045, + "reward": 1.0610772967338562, + "reward_std": 0.31889135390520096, + "rewards/correct_code_reward_func": 0.5833333730697632, + "rewards/len_reward_func": 0.4777439534664154, "step": 485 }, { - "completion_length": 51.4375, - "epoch": 7.768, - "grad_norm": 0.8356137768154641, - "kl": 0.173828125, + "completion_length": 67.33333587646484, + "epoch": 7.72, + "grad_norm": 8.986243345931461, + "kl": 9.318359375, "learning_rate": 1.0272702311203695e-09, - "loss": 0.0002, - "reward": 0.7291666865348816, - "reward_std": 0.13607725501060486, - "rewards/correct_code_reward_func": 0.2291666716337204, + "loss": 0.0093, + "reward": 0.9583333730697632, + "reward_std": 0.34018656611442566, + "rewards/correct_code_reward_func": 0.4583333432674408, "rewards/len_reward_func": 0.5, "step": 486 }, { - "completion_length": 45.72916793823242, - "epoch": 7.784, - "grad_norm": 0.4088288944119298, - "kl": 0.19970703125, + "completion_length": 29.687501907348633, + "epoch": 7.736, + "grad_norm": 0.7218544621194819, + "kl": 0.833984375, "learning_rate": 8.858421509933823e-10, - "loss": 0.0002, - "reward": 0.8125000298023224, - "reward_std": 0.22516432031989098, - "rewards/correct_code_reward_func": 0.31250002048909664, + "loss": 0.0008, + "reward": 0.7291666865348816, + "reward_std": 0.13607724383473396, + "rewards/correct_code_reward_func": 0.2291666679084301, "rewards/len_reward_func": 0.5, "step": 487 }, { - "completion_length": 102.79167175292969, - "epoch": 7.8, - "grad_norm": 0.433684569229063, - "kl": 1.2890625, + "completion_length": 31.27083396911621, + "epoch": 7.752, + "grad_norm": 7.305486246951896, + "kl": 0.490234375, "learning_rate": 7.548664020045059e-10, - "loss": 0.0013, - "reward": 0.7926969826221466, - "reward_std": 0.251124428701587, - "rewards/correct_code_reward_func": 0.3333333432674408, - "rewards/len_reward_func": 0.4593636244535446, + "loss": 0.0005, + "reward": 0.979166716337204, + "reward_std": 0.21322893351316452, + "rewards/correct_code_reward_func": 0.4791666865348816, + "rewards/len_reward_func": 0.5, "step": 488 }, { - "completion_length": 82.77083587646484, - "epoch": 7.816, - "grad_norm": 0.6059762447804334, - "kl": 0.23095703125, + "completion_length": 33.4375, + "epoch": 7.768, + "grad_norm": 5.374284656554601, + "kl": 0.3330078125, "learning_rate": 6.343484796338394e-10, - "loss": 0.0002, - "reward": 0.8035337924957275, - "reward_std": 0.14454852044582367, - "rewards/correct_code_reward_func": 0.3125000149011612, - "rewards/len_reward_func": 0.49103376269340515, + "loss": 0.0003, + "reward": 0.8125000298023224, + "reward_std": 0.22516432031989098, + "rewards/correct_code_reward_func": 0.31250002048909664, + "rewards/len_reward_func": 0.5, "step": 489 }, { - "completion_length": 54.208335876464844, - "epoch": 7.832, - "grad_norm": 0.8784125468778735, - "kl": 0.4736328125, + "completion_length": 29.937500953674316, + "epoch": 7.784, + "grad_norm": 20.72618168594815, + "kl": 0.4404296875, "learning_rate": 5.242934405720878e-10, - "loss": 0.0005, - "reward": 0.8958333432674408, - "reward_std": 0.37034808099269867, - "rewards/correct_code_reward_func": 0.3958333432674408, + "loss": 0.0004, + "reward": 0.7708333432674408, + "reward_std": 0.28126100823283195, + "rewards/correct_code_reward_func": 0.27083333395421505, "rewards/len_reward_func": 0.5, "step": 490 }, { - "completion_length": 47.52083492279053, - "epoch": 7.848, - "grad_norm": 0.8964848846939538, - "kl": 15.7099609375, + "completion_length": 63.47916793823242, + "epoch": 7.8, + "grad_norm": 7.480626745233278, + "kl": 0.3974609375, "learning_rate": 4.2470590250823223e-10, - "loss": 0.0157, - "reward": 1.0416666865348816, - "reward_std": 0.3205421194434166, - "rewards/correct_code_reward_func": 0.5416666716337204, + "loss": 0.0004, + "reward": 0.8125000298023224, + "reward_std": 0.175345279276371, + "rewards/correct_code_reward_func": 0.3125000074505806, "rewards/len_reward_func": 0.5, "step": 491 }, { - "completion_length": 53.52083396911621, - "epoch": 7.864, - "grad_norm": 1.1605037142060004, - "kl": 0.216796875, + "completion_length": 56.16666793823242, + "epoch": 7.816, + "grad_norm": 4.532566153481945, + "kl": 0.3486328125, "learning_rate": 3.355900439359072e-10, - "loss": 0.0002, - "reward": 0.8333333730697632, - "reward_std": 0.24966806918382645, - "rewards/correct_code_reward_func": 0.3333333432674408, + "loss": 0.0003, + "reward": 0.9583333730697632, + "reward_std": 0.19500282034277916, + "rewards/correct_code_reward_func": 0.4583333432674408, "rewards/len_reward_func": 0.5, "step": 492 }, { - "completion_length": 48.47916793823242, - "epoch": 7.88, - "grad_norm": 1.021764440821631, - "kl": 0.4169921875, + "completion_length": 42.8125, + "epoch": 7.832, + "grad_norm": 9.37284729343891, + "kl": 0.2890625, "learning_rate": 2.569496039780683e-10, - "loss": 0.0004, - "reward": 0.9583333432674408, - "reward_std": 0.39485183358192444, - "rewards/correct_code_reward_func": 0.4583333432674408, - "rewards/len_reward_func": 0.5, + "loss": 0.0003, + "reward": 0.8958333432674408, + "reward_std": 0.30231600999832153, + "rewards/correct_code_reward_func": 0.4166666865348816, + "rewards/len_reward_func": 0.4791666716337204, "step": 493 }, { - "completion_length": 47.45833396911621, - "epoch": 7.896, - "grad_norm": 0.8183208039256259, - "kl": 0.31591796875, + "completion_length": 25.291666984558105, + "epoch": 7.848, + "grad_norm": 8.181654775124288, + "kl": 1.865234375, "learning_rate": 1.8878788223009035e-10, - "loss": 0.0003, - "reward": 1.2083333730697632, - "reward_std": 0.3885742127895355, - "rewards/correct_code_reward_func": 0.7083333432674408, + "loss": 0.0019, + "reward": 1.2916667461395264, + "reward_std": 0.24966806918382645, + "rewards/correct_code_reward_func": 0.7916666865348816, "rewards/len_reward_func": 0.5, "step": 494 }, { - "completion_length": 52.85416793823242, - "epoch": 7.912, - "grad_norm": 1.1523367720266866, - "kl": 0.21240234375, + "completion_length": 36.47916793823242, + "epoch": 7.864, + "grad_norm": 2.730688674099727, + "kl": 0.3271484375, "learning_rate": 1.3110773862126667e-10, - "loss": 0.0002, - "reward": 0.7689549326896667, - "reward_std": 0.16004110872745514, - "rewards/correct_code_reward_func": 0.2708333432674408, - "rewards/len_reward_func": 0.49812158942222595, + "loss": 0.0003, + "reward": 0.9583333730697632, + "reward_std": 0.19500280916690826, + "rewards/correct_code_reward_func": 0.4583333432674408, + "rewards/len_reward_func": 0.5, "step": 495 }, { - "completion_length": 63.97916793823242, - "epoch": 7.928, - "grad_norm": 0.6522864663580373, - "kl": 0.3603515625, + "completion_length": 38.458335876464844, + "epoch": 7.88, + "grad_norm": 2.8350462414240383, + "kl": 0.3203125, "learning_rate": 8.391159329496079e-11, - "loss": 0.0004, - "reward": 0.8750000298023224, - "reward_std": 0.2630349025130272, - "rewards/correct_code_reward_func": 0.3750000149011612, + "loss": 0.0003, + "reward": 0.9583333432674408, + "reward_std": 0.38511236757040024, + "rewards/correct_code_reward_func": 0.4583333432674408, "rewards/len_reward_func": 0.5, "step": 496 }, { - "completion_length": 54.87500190734863, - "epoch": 7.944, - "grad_norm": 0.656638108368775, - "kl": 0.29296875, + "completion_length": 22.937500953674316, + "epoch": 7.896, + "grad_norm": 9.26990536147569, + "kl": 0.361328125, "learning_rate": 4.7201426506854324e-11, - "loss": 0.0003, - "reward": 0.9375, - "reward_std": 0.2041093371808529, - "rewards/correct_code_reward_func": 0.4375, + "loss": 0.0004, + "reward": 1.0208333432674408, + "reward_std": 0.42644475400447845, + "rewards/correct_code_reward_func": 0.5208333432674408, "rewards/len_reward_func": 0.5, "step": 497 }, { - "completion_length": 61.50000190734863, - "epoch": 7.96, - "grad_norm": 0.7980497826899465, - "kl": 0.17626953125, + "completion_length": 35.93750190734863, + "epoch": 7.912, + "grad_norm": 3.8907781924394014, + "kl": 0.40234375, "learning_rate": 2.097877854204122e-11, - "loss": 0.0002, - "reward": 0.9583333432674408, - "reward_std": 0.2994871288537979, - "rewards/correct_code_reward_func": 0.4583333358168602, + "loss": 0.0004, + "reward": 0.8333333730697632, + "reward_std": 0.0, + "rewards/correct_code_reward_func": 0.3333333432674408, "rewards/len_reward_func": 0.5, "step": 498 }, { - "completion_length": 32.666666984558105, - "epoch": 7.976, - "grad_norm": 0.7504471109992272, - "kl": 1.03125, + "completion_length": 38.645835876464844, + "epoch": 7.928, + "grad_norm": 13.982592715406271, + "kl": 17.47998046875, "learning_rate": 5.244749650301639e-12, - "loss": 0.001, - "reward": 1.1041666865348816, - "reward_std": 0.2041093371808529, - "rewards/correct_code_reward_func": 0.6041666865348816, + "loss": 0.0175, + "reward": 1.041666716337204, + "reward_std": 0.16623875498771667, + "rewards/correct_code_reward_func": 0.5416666865348816, "rewards/len_reward_func": 0.5, "step": 499 }, { - "completion_length": 53.52083396911621, - "epoch": 7.992, - "grad_norm": 1.6068675739272436, - "kl": 0.3828125, + "completion_length": 29.500001907348633, + "epoch": 7.944, + "grad_norm": 0.028883891880915308, + "kl": 2.763671875, "learning_rate": 0.0, - "loss": 0.0004, - "reward": 0.8125, - "reward_std": 0.39911213517189026, - "rewards/correct_code_reward_func": 0.3125, + "loss": 0.0028, + "reward": 0.8333333730697632, + "reward_std": 0.0, + "rewards/correct_code_reward_func": 0.3333333432674408, "rewards/len_reward_func": 0.5, "step": 500 }, { - "epoch": 7.992, + "epoch": 7.944, "step": 500, "total_flos": 0.0, - "train_loss": 0.0022241791886344797, - "train_runtime": 11520.3381, - "train_samples_per_second": 0.26, - "train_steps_per_second": 0.043 + "train_loss": 0.019628612981648565, + "train_runtime": 7912.6404, + "train_samples_per_second": 0.379, + "train_steps_per_second": 0.063 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 9, - "save_steps": 25, + "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": {