{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21488047273704003, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 3253.0, "epoch": 0.00042976094547408005, "grad_norm": 0.05285495147109032, "kl": 0.0, "learning_rate": 2e-08, "loss": 0.0035, "reward": -0.2934060515835881, "reward_std": 0.3901851810514927, "rewards/cosine_scaled_reward": -0.21961969044059515, "rewards/format_reward": 0.14583333395421505, "step": 1 }, { "completion_length": 3107.5625610351562, "epoch": 0.0008595218909481601, "grad_norm": 0.13000866770744324, "kl": 0.0, "learning_rate": 4e-08, "loss": 0.0316, "reward": 0.02774460567161441, "reward_std": 0.8293320126831532, "rewards/cosine_scaled_reward": -0.15279437159188092, "rewards/format_reward": 0.33333333767950535, "step": 2 }, { "completion_length": 3353.7083740234375, "epoch": 0.0012892828364222402, "grad_norm": 0.07120331376791, "kl": 7.002800703048706e-05, "learning_rate": 6e-08, "loss": 0.0141, "reward": -0.37202255614101887, "reward_std": 0.3974655419588089, "rewards/cosine_scaled_reward": -0.2693446185439825, "rewards/format_reward": 0.16666667349636555, "step": 3 }, { "completion_length": 3528.6041870117188, "epoch": 0.0017190437818963202, "grad_norm": 0.126618430018425, "kl": 5.048513412475586e-05, "learning_rate": 8e-08, "loss": 0.0258, "reward": -0.21557927876710892, "reward_std": 0.6568086072802544, "rewards/cosine_scaled_reward": -0.1598729733377695, "rewards/format_reward": 0.1041666679084301, "step": 4 }, { "completion_length": 2682.7916946411133, "epoch": 0.0021488047273704003, "grad_norm": 0.059518564492464066, "kl": 3.789365291595459e-05, "learning_rate": 1e-07, "loss": 0.0185, "reward": 0.3876744210720062, "reward_std": 0.4762498773634434, "rewards/cosine_scaled_reward": -0.0249127852730453, "rewards/format_reward": 0.4375000074505806, "step": 5 }, { "completion_length": 3289.250030517578, "epoch": 0.0025785656728444803, "grad_norm": 0.16840262711048126, "kl": 4.09930944442749e-05, "learning_rate": 1.2e-07, "loss": 0.0948, "reward": 0.047209203243255615, "reward_std": 0.7325033247470856, "rewards/cosine_scaled_reward": -0.10139540303498507, "rewards/format_reward": 0.2500000037252903, "step": 6 }, { "completion_length": 3163.354248046875, "epoch": 0.0030083266183185604, "grad_norm": 0.07846385985612869, "kl": 3.8057565689086914e-05, "learning_rate": 1.4e-07, "loss": 0.0414, "reward": -0.278631275286898, "reward_std": 0.47105691581964493, "rewards/cosine_scaled_reward": -0.2747323103249073, "rewards/format_reward": 0.27083333767950535, "step": 7 }, { "completion_length": 3571.8125, "epoch": 0.0034380875637926404, "grad_norm": 0.06993106007575989, "kl": 3.971904516220093e-05, "learning_rate": 1.6e-07, "loss": 0.0014, "reward": -0.4162732649128884, "reward_std": 0.42826348543167114, "rewards/cosine_scaled_reward": -0.2289699697867036, "rewards/format_reward": 0.0416666679084301, "step": 8 }, { "completion_length": 2767.1666946411133, "epoch": 0.0038678485092667205, "grad_norm": 0.08935723453760147, "kl": 2.5838613510131836e-05, "learning_rate": 1.8e-07, "loss": 0.0137, "reward": 0.30400341004133224, "reward_std": 0.3838596399873495, "rewards/cosine_scaled_reward": -0.004248311743140221, "rewards/format_reward": 0.3125, "step": 9 }, { "completion_length": 3455.2291870117188, "epoch": 0.0042976094547408005, "grad_norm": 0.08207973837852478, "kl": 2.745445817708969e-05, "learning_rate": 2e-07, "loss": 0.0413, "reward": -0.3087661569006741, "reward_std": 0.5754895862191916, "rewards/cosine_scaled_reward": -0.21688307914882898, "rewards/format_reward": 0.12500000186264515, "step": 10 }, { "completion_length": 3306.2708740234375, "epoch": 0.00472737040021488, "grad_norm": 0.12304508686065674, "kl": 5.7816505432128906e-05, "learning_rate": 2.1999999999999998e-07, "loss": 0.0612, "reward": -0.39621282927691936, "reward_std": 0.6319154910743237, "rewards/cosine_scaled_reward": -0.29185639694333076, "rewards/format_reward": 0.1875000037252903, "step": 11 }, { "completion_length": 3239.291717529297, "epoch": 0.005157131345688961, "grad_norm": 0.20740452408790588, "kl": 4.238635301589966e-05, "learning_rate": 2.4e-07, "loss": 0.0644, "reward": -0.1713746078312397, "reward_std": 0.5698597133159637, "rewards/cosine_scaled_reward": -0.1898539732210338, "rewards/format_reward": 0.2083333395421505, "step": 12 }, { "completion_length": 3380.9791870117188, "epoch": 0.00558689229116304, "grad_norm": 0.12278042733669281, "kl": 3.8743019104003906e-05, "learning_rate": 2.6e-07, "loss": 0.0172, "reward": 0.017920807003974915, "reward_std": 0.7111752815544605, "rewards/cosine_scaled_reward": -0.12645625788718462, "rewards/format_reward": 0.2708333432674408, "step": 13 }, { "completion_length": 3532.812530517578, "epoch": 0.006016653236637121, "grad_norm": 0.13022691011428833, "kl": 5.067698657512665e-05, "learning_rate": 2.8e-07, "loss": 0.0316, "reward": -0.41287917108274996, "reward_std": 0.6203740425407887, "rewards/cosine_scaled_reward": -0.23768958542495966, "rewards/format_reward": 0.06250000186264515, "step": 14 }, { "completion_length": 3509.7291870117188, "epoch": 0.0064464141821112, "grad_norm": 0.10133329033851624, "kl": 5.383789539337158e-05, "learning_rate": 3e-07, "loss": 0.0078, "reward": -0.33697362802922726, "reward_std": 0.5880320444703102, "rewards/cosine_scaled_reward": -0.2622368196025491, "rewards/format_reward": 0.18750000558793545, "step": 15 }, { "completion_length": 3331.291717529297, "epoch": 0.006876175127585281, "grad_norm": 0.13719122111797333, "kl": 4.747509956359863e-05, "learning_rate": 3.2e-07, "loss": 0.0313, "reward": -0.17862147465348244, "reward_std": 0.531953739002347, "rewards/cosine_scaled_reward": -0.18306074477732182, "rewards/format_reward": 0.18750000186264515, "step": 16 }, { "completion_length": 3114.2291679382324, "epoch": 0.0073059360730593605, "grad_norm": 0.08354074507951736, "kl": 4.62457537651062e-05, "learning_rate": 3.4000000000000003e-07, "loss": 0.0201, "reward": 0.10213145613670349, "reward_std": 0.4036296680569649, "rewards/cosine_scaled_reward": -0.032267618109472096, "rewards/format_reward": 0.1666666679084301, "step": 17 }, { "completion_length": 3196.250030517578, "epoch": 0.007735697018533441, "grad_norm": 0.21876828372478485, "kl": 5.27501106262207e-05, "learning_rate": 3.6e-07, "loss": 0.0631, "reward": -0.09505215287208557, "reward_std": 0.5342105738818645, "rewards/cosine_scaled_reward": -0.13085942575708032, "rewards/format_reward": 0.1666666716337204, "step": 18 }, { "completion_length": 3439.291717529297, "epoch": 0.00816545796400752, "grad_norm": 0.10593213140964508, "kl": 5.320459604263306e-05, "learning_rate": 3.7999999999999996e-07, "loss": 0.0385, "reward": -0.32695348002016544, "reward_std": 0.5262767961248755, "rewards/cosine_scaled_reward": -0.2259767409414053, "rewards/format_reward": 0.1250000037252903, "step": 19 }, { "completion_length": 3035.979217529297, "epoch": 0.008595218909481601, "grad_norm": 0.21864020824432373, "kl": 4.190206527709961e-05, "learning_rate": 4e-07, "loss": 0.0553, "reward": 0.26495958119630814, "reward_std": 0.7490959875285625, "rewards/cosine_scaled_reward": -0.044603541027754545, "rewards/format_reward": 0.3541666716337204, "step": 20 }, { "completion_length": 3106.791702270508, "epoch": 0.009024979854955682, "grad_norm": 0.10922188311815262, "kl": 2.148374915122986e-05, "learning_rate": 4.1999999999999995e-07, "loss": 0.0396, "reward": 0.13372548669576645, "reward_std": 0.6249867780134082, "rewards/cosine_scaled_reward": -0.05813724081963301, "rewards/format_reward": 0.25000000186264515, "step": 21 }, { "completion_length": 3425.6458740234375, "epoch": 0.00945474080042976, "grad_norm": 0.13758547604084015, "kl": 4.2632222175598145e-05, "learning_rate": 4.3999999999999997e-07, "loss": 0.0467, "reward": -0.12438997253775597, "reward_std": 0.7269462086260319, "rewards/cosine_scaled_reward": -0.15594498888822272, "rewards/format_reward": 0.18750000186264515, "step": 22 }, { "completion_length": 3367.7083435058594, "epoch": 0.00988450174590384, "grad_norm": 0.09374785423278809, "kl": 5.008280277252197e-05, "learning_rate": 4.6e-07, "loss": 0.0208, "reward": -0.2673447486013174, "reward_std": 0.5117070078849792, "rewards/cosine_scaled_reward": -0.19617237476632, "rewards/format_reward": 0.1250000037252903, "step": 23 }, { "completion_length": 3511.687530517578, "epoch": 0.010314262691377921, "grad_norm": 0.11542374640703201, "kl": 3.9769336581230164e-05, "learning_rate": 4.8e-07, "loss": 0.022, "reward": -0.4350108131766319, "reward_std": 0.39801024086773396, "rewards/cosine_scaled_reward": -0.2487554084509611, "rewards/format_reward": 0.06250000186264515, "step": 24 }, { "completion_length": 3159.2916870117188, "epoch": 0.010744023636852002, "grad_norm": 0.14854171872138977, "kl": 2.690078690648079e-05, "learning_rate": 5e-07, "loss": 0.0134, "reward": 0.05546778813004494, "reward_std": 0.6898578107357025, "rewards/cosine_scaled_reward": -0.08684943057596684, "rewards/format_reward": 0.2291666679084301, "step": 25 }, { "completion_length": 3132.125015258789, "epoch": 0.01117378458232608, "grad_norm": 0.14119301736354828, "kl": 1.812633126974106e-05, "learning_rate": 5.2e-07, "loss": 0.1173, "reward": -0.061668234411627054, "reward_std": 0.682558810338378, "rewards/cosine_scaled_reward": -0.14541744999587536, "rewards/format_reward": 0.2291666716337204, "step": 26 }, { "completion_length": 3474.1458435058594, "epoch": 0.011603545527800161, "grad_norm": 0.09565360099077225, "kl": 3.404170274734497e-05, "learning_rate": 5.4e-07, "loss": 0.0169, "reward": -0.2776073571294546, "reward_std": 0.49034018255770206, "rewards/cosine_scaled_reward": -0.19088700972497463, "rewards/format_reward": 0.10416666977107525, "step": 27 }, { "completion_length": 3484.8958435058594, "epoch": 0.012033306473274242, "grad_norm": 0.0821179524064064, "kl": 3.4656375646591187e-05, "learning_rate": 5.6e-07, "loss": 0.0141, "reward": -0.3942480720579624, "reward_std": 0.4312909319996834, "rewards/cosine_scaled_reward": -0.2596240369603038, "rewards/format_reward": 0.12500000186264515, "step": 28 }, { "completion_length": 3419.7708740234375, "epoch": 0.012463067418748322, "grad_norm": 0.17844527959823608, "kl": 3.374926745891571e-05, "learning_rate": 5.8e-07, "loss": 0.0338, "reward": -0.2870722636580467, "reward_std": 0.5572760831564665, "rewards/cosine_scaled_reward": -0.19561946392059326, "rewards/format_reward": 0.1041666679084301, "step": 29 }, { "completion_length": 3267.4791870117188, "epoch": 0.0128928283642224, "grad_norm": 0.0695347934961319, "kl": 5.2947551012039185e-05, "learning_rate": 6e-07, "loss": 0.02, "reward": -0.380221800878644, "reward_std": 0.35473627783358097, "rewards/cosine_scaled_reward": -0.2630275748670101, "rewards/format_reward": 0.1458333395421505, "step": 30 }, { "completion_length": 3152.2916870117188, "epoch": 0.013322589309696481, "grad_norm": 0.077418252825737, "kl": 4.14801761507988e-05, "learning_rate": 6.2e-07, "loss": 0.0351, "reward": -0.1975468024611473, "reward_std": 0.5000214390456676, "rewards/cosine_scaled_reward": -0.20294007752090693, "rewards/format_reward": 0.2083333358168602, "step": 31 }, { "completion_length": 3464.187530517578, "epoch": 0.013752350255170562, "grad_norm": 0.1896747648715973, "kl": 2.8077512979507446e-05, "learning_rate": 6.4e-07, "loss": 0.0446, "reward": -0.24830490379827097, "reward_std": 0.6987595185637474, "rewards/cosine_scaled_reward": -0.1970691168680787, "rewards/format_reward": 0.14583333767950535, "step": 32 }, { "completion_length": 3195.041717529297, "epoch": 0.014182111200644642, "grad_norm": 0.09090506285429001, "kl": 4.713237285614014e-05, "learning_rate": 6.6e-07, "loss": 0.0015, "reward": -0.04083757125772536, "reward_std": 0.521674145013094, "rewards/cosine_scaled_reward": -0.17666877806186676, "rewards/format_reward": 0.3125000037252903, "step": 33 }, { "completion_length": 3124.0208740234375, "epoch": 0.014611872146118721, "grad_norm": 0.13796433806419373, "kl": 3.180652856826782e-05, "learning_rate": 6.800000000000001e-07, "loss": 0.0319, "reward": 0.003058883361518383, "reward_std": 0.8531513474881649, "rewards/cosine_scaled_reward": -0.14430390298366547, "rewards/format_reward": 0.291666679084301, "step": 34 }, { "completion_length": 3497.437530517578, "epoch": 0.015041633091592801, "grad_norm": 0.11644481867551804, "kl": 2.6782508939504623e-05, "learning_rate": 7e-07, "loss": 0.0063, "reward": -0.31570737808942795, "reward_std": 0.6755894236266613, "rewards/cosine_scaled_reward": -0.23077034763991833, "rewards/format_reward": 0.14583333767950535, "step": 35 }, { "completion_length": 3424.875030517578, "epoch": 0.015471394037066882, "grad_norm": 0.10873474180698395, "kl": 5.556643009185791e-05, "learning_rate": 7.2e-07, "loss": 0.0312, "reward": -0.34113267669454217, "reward_std": 0.6471816077828407, "rewards/cosine_scaled_reward": -0.25389967393130064, "rewards/format_reward": 0.16666666977107525, "step": 36 }, { "completion_length": 3270.500015258789, "epoch": 0.015901154982540962, "grad_norm": 0.12086840718984604, "kl": 0.00013577681966125965, "learning_rate": 7.4e-07, "loss": 0.0127, "reward": -0.08354963641613722, "reward_std": 0.680159542709589, "rewards/cosine_scaled_reward": -0.16677482542581856, "rewards/format_reward": 0.25000000186264515, "step": 37 }, { "completion_length": 2747.083351135254, "epoch": 0.01633091592801504, "grad_norm": 0.10436119884252548, "kl": 0.00017508864402770996, "learning_rate": 7.599999999999999e-07, "loss": 0.0205, "reward": -0.09750039968639612, "reward_std": 0.5464910175651312, "rewards/cosine_scaled_reward": -0.23625020775943995, "rewards/format_reward": 0.3750000037252903, "step": 38 }, { "completion_length": 2956.3541870117188, "epoch": 0.01676067687348912, "grad_norm": 0.17674781382083893, "kl": 0.00010013580322265625, "learning_rate": 7.799999999999999e-07, "loss": 0.0757, "reward": 0.21702358219772577, "reward_std": 0.7503162249922752, "rewards/cosine_scaled_reward": -0.04773822211427614, "rewards/format_reward": 0.31250000558793545, "step": 39 }, { "completion_length": 3458.7708435058594, "epoch": 0.017190437818963202, "grad_norm": 0.08916423469781876, "kl": 0.000110674649477005, "learning_rate": 8e-07, "loss": 0.0213, "reward": -0.49304681923240423, "reward_std": 0.47094444930553436, "rewards/cosine_scaled_reward": -0.2777734100818634, "rewards/format_reward": 0.06250000186264515, "step": 40 }, { "completion_length": 3565.562530517578, "epoch": 0.01762019876443728, "grad_norm": 0.10349541157484055, "kl": 0.00018181651830673218, "learning_rate": 8.199999999999999e-07, "loss": 0.0059, "reward": -0.4495491646230221, "reward_std": 0.5272008180618286, "rewards/cosine_scaled_reward": -0.25602457765489817, "rewards/format_reward": 0.06250000186264515, "step": 41 }, { "completion_length": 2912.354217529297, "epoch": 0.018049959709911363, "grad_norm": 0.14541999995708466, "kl": 7.766485214233398e-05, "learning_rate": 8.399999999999999e-07, "loss": 0.0659, "reward": 0.6349759213626385, "reward_std": 0.7166076265275478, "rewards/cosine_scaled_reward": 0.09873794205486774, "rewards/format_reward": 0.43750001303851604, "step": 42 }, { "completion_length": 3571.0833435058594, "epoch": 0.018479720655385442, "grad_norm": 0.08275973051786423, "kl": 7.909350097179413e-05, "learning_rate": 8.599999999999999e-07, "loss": 0.0082, "reward": -0.436044542118907, "reward_std": 0.48770271986722946, "rewards/cosine_scaled_reward": -0.2388556022197008, "rewards/format_reward": 0.0416666679084301, "step": 43 }, { "completion_length": 3427.0625, "epoch": 0.01890948160085952, "grad_norm": 0.06302553415298462, "kl": 0.00016103684902191162, "learning_rate": 8.799999999999999e-07, "loss": -0.0012, "reward": 0.1888022180646658, "reward_std": 0.36756357457488775, "rewards/cosine_scaled_reward": 0.0006511025130748749, "rewards/format_reward": 0.18750000186264515, "step": 44 }, { "completion_length": 3324.9583435058594, "epoch": 0.019339242546333603, "grad_norm": 0.16473354399204254, "kl": 0.00016005896031856537, "learning_rate": 9e-07, "loss": 0.0398, "reward": 0.28645146638154984, "reward_std": 0.7519912570714951, "rewards/cosine_scaled_reward": 0.007809067144989967, "rewards/format_reward": 0.27083334140479565, "step": 45 }, { "completion_length": 3477.25, "epoch": 0.01976900349180768, "grad_norm": 0.09687305241823196, "kl": 4.568975418806076e-05, "learning_rate": 9.2e-07, "loss": 0.0312, "reward": -0.40328337997198105, "reward_std": 0.47868940234184265, "rewards/cosine_scaled_reward": -0.24330836161971092, "rewards/format_reward": 0.0833333358168602, "step": 46 }, { "completion_length": 3411.791748046875, "epoch": 0.02019876443728176, "grad_norm": 0.11385305225849152, "kl": 0.00021082162857055664, "learning_rate": 9.399999999999999e-07, "loss": 0.0223, "reward": -0.07047207280993462, "reward_std": 0.6432409286499023, "rewards/cosine_scaled_reward": -0.2019027005881071, "rewards/format_reward": 0.3333333395421505, "step": 47 }, { "completion_length": 3334.5208740234375, "epoch": 0.020628525382755843, "grad_norm": 0.12270227819681168, "kl": 0.00033026933670043945, "learning_rate": 9.6e-07, "loss": 0.0703, "reward": -0.12003640341572464, "reward_std": 0.6481129042804241, "rewards/cosine_scaled_reward": -0.18501820880919695, "rewards/format_reward": 0.25000000558793545, "step": 48 }, { "completion_length": 3545.4166870117188, "epoch": 0.02105828632822992, "grad_norm": 0.07848583906888962, "kl": 0.00020009279251098633, "learning_rate": 9.8e-07, "loss": 0.0053, "reward": -0.5022422587499022, "reward_std": 0.4621626250445843, "rewards/cosine_scaled_reward": -0.28237113170325756, "rewards/format_reward": 0.06250000186264515, "step": 49 }, { "completion_length": 3144.041702270508, "epoch": 0.021488047273704004, "grad_norm": 0.1617872267961502, "kl": 0.0005618929862976074, "learning_rate": 1e-06, "loss": 0.0754, "reward": 0.1075476721744053, "reward_std": 0.8246349208056927, "rewards/cosine_scaled_reward": -0.1233095005445648, "rewards/format_reward": 0.3541666716337204, "step": 50 }, { "completion_length": 3563.9583435058594, "epoch": 0.021917808219178082, "grad_norm": 0.0936054214835167, "kl": 0.00014024972915649414, "learning_rate": 9.999890338174275e-07, "loss": 0.0166, "reward": -0.34706551616545767, "reward_std": 0.596009723842144, "rewards/cosine_scaled_reward": -0.20478275511413813, "rewards/format_reward": 0.06250000186264515, "step": 51 }, { "completion_length": 3161.7083435058594, "epoch": 0.02234756916465216, "grad_norm": 0.11951952427625656, "kl": 0.0007489323616027832, "learning_rate": 9.999561358041868e-07, "loss": 0.0441, "reward": 0.03817065432667732, "reward_std": 0.4603902120143175, "rewards/cosine_scaled_reward": -0.09549800679087639, "rewards/format_reward": 0.2291666753590107, "step": 52 }, { "completion_length": 3226.250015258789, "epoch": 0.022777330110126243, "grad_norm": 0.10595440119504929, "kl": 0.00032591819763183594, "learning_rate": 9.999013075636804e-07, "loss": 0.0063, "reward": -0.3045101873576641, "reward_std": 0.482793964445591, "rewards/cosine_scaled_reward": -0.22517176298424602, "rewards/format_reward": 0.14583333395421505, "step": 53 }, { "completion_length": 3248.5208740234375, "epoch": 0.023207091055600322, "grad_norm": 0.2151467651128769, "kl": 0.0005822470411658287, "learning_rate": 9.998245517681593e-07, "loss": 0.0626, "reward": 0.4128889187704772, "reward_std": 1.250886969268322, "rewards/cosine_scaled_reward": 0.008527783211320639, "rewards/format_reward": 0.39583334885537624, "step": 54 }, { "completion_length": 3015.354202270508, "epoch": 0.0236368520010744, "grad_norm": 0.09920001775026321, "kl": 0.0009176731109619141, "learning_rate": 9.997258721585931e-07, "loss": 0.0271, "reward": 0.10421956982463598, "reward_std": 0.5782017223536968, "rewards/cosine_scaled_reward": -0.10414022067561746, "rewards/format_reward": 0.31250000186264515, "step": 55 }, { "completion_length": 3524.3958740234375, "epoch": 0.024066612946548483, "grad_norm": 0.1504775583744049, "kl": 0.0009249448776245117, "learning_rate": 9.996052735444862e-07, "loss": 0.0285, "reward": -0.12897427380084991, "reward_std": 0.7604504376649857, "rewards/cosine_scaled_reward": -0.14782046154141426, "rewards/format_reward": 0.16666666977107525, "step": 56 }, { "completion_length": 3444.4583740234375, "epoch": 0.024496373892022562, "grad_norm": 0.16023826599121094, "kl": 0.0026568472385406494, "learning_rate": 9.994627618036452e-07, "loss": 0.0567, "reward": -0.20668507413938642, "reward_std": 0.6743349675089121, "rewards/cosine_scaled_reward": -0.16584253683686256, "rewards/format_reward": 0.12500000186264515, "step": 57 }, { "completion_length": 2869.687511444092, "epoch": 0.024926134837496644, "grad_norm": 0.06721269339323044, "kl": 0.0018393564969301224, "learning_rate": 9.992983438818915e-07, "loss": -0.0129, "reward": 0.4099194258451462, "reward_std": 0.5589842461049557, "rewards/cosine_scaled_reward": -0.0033736377954483032, "rewards/format_reward": 0.4166666679084301, "step": 58 }, { "completion_length": 3552.2916870117188, "epoch": 0.025355895782970723, "grad_norm": 0.06759820878505707, "kl": 0.0007061958312988281, "learning_rate": 9.991120277927223e-07, "loss": 0.0055, "reward": -0.4608234167098999, "reward_std": 0.3956527151167393, "rewards/cosine_scaled_reward": -0.25124504044651985, "rewards/format_reward": 0.0416666679084301, "step": 59 }, { "completion_length": 3552.1041870117188, "epoch": 0.0257856567284448, "grad_norm": 0.08294526487588882, "kl": 0.0004831552505493164, "learning_rate": 9.989038226169207e-07, "loss": 0.0061, "reward": -0.4573524706065655, "reward_std": 0.43482602946460247, "rewards/cosine_scaled_reward": -0.270342905074358, "rewards/format_reward": 0.0833333358168602, "step": 60 }, { "completion_length": 3572.4583435058594, "epoch": 0.026215417673918884, "grad_norm": 0.10507713258266449, "kl": 0.0012664794921875, "learning_rate": 9.98673738502114e-07, "loss": 0.0065, "reward": -0.3749425411224365, "reward_std": 0.562547106295824, "rewards/cosine_scaled_reward": -0.20830460358411074, "rewards/format_reward": 0.0416666679084301, "step": 61 }, { "completion_length": 2949.2291717529297, "epoch": 0.026645178619392963, "grad_norm": 0.11524305492639542, "kl": 0.0003293752670288086, "learning_rate": 9.98421786662277e-07, "loss": 0.0001, "reward": 0.29203490912914276, "reward_std": 0.6473203133791685, "rewards/cosine_scaled_reward": -0.02064920263364911, "rewards/format_reward": 0.3333333432674408, "step": 62 }, { "completion_length": 3352.416717529297, "epoch": 0.02707493956486704, "grad_norm": 0.19914790987968445, "kl": 0.0017281174659729004, "learning_rate": 9.981479793771866e-07, "loss": 0.0823, "reward": -0.19455218221992254, "reward_std": 0.5499789193272591, "rewards/cosine_scaled_reward": -0.17019275948405266, "rewards/format_reward": 0.1458333358168602, "step": 63 }, { "completion_length": 3502.875030517578, "epoch": 0.027504700510341124, "grad_norm": 0.1207519918680191, "kl": 0.0012881755828857422, "learning_rate": 9.97852329991824e-07, "loss": 0.021, "reward": -0.28562765405513346, "reward_std": 0.6909630820155144, "rewards/cosine_scaled_reward": -0.23656382597982883, "rewards/format_reward": 0.18750000558793545, "step": 64 }, { "completion_length": 3395.3125, "epoch": 0.027934461455815202, "grad_norm": 0.06995224207639694, "kl": 0.0009489059448242188, "learning_rate": 9.975348529157229e-07, "loss": 0.0085, "reward": -0.015769858844578266, "reward_std": 0.4281216077506542, "rewards/cosine_scaled_reward": -0.10163494199514389, "rewards/format_reward": 0.18750000186264515, "step": 65 }, { "completion_length": 3546.875, "epoch": 0.028364222401289284, "grad_norm": 0.09451273083686829, "kl": 0.000741124153137207, "learning_rate": 9.971955636222684e-07, "loss": 0.0221, "reward": -0.35794772882945836, "reward_std": 0.5166397821158171, "rewards/cosine_scaled_reward": -0.1998071982525289, "rewards/format_reward": 0.0416666679084301, "step": 66 }, { "completion_length": 2604.479217529297, "epoch": 0.028793983346763363, "grad_norm": 0.16432027518749237, "kl": 0.004400730133056641, "learning_rate": 9.968344786479415e-07, "loss": 0.0814, "reward": 0.321387130767107, "reward_std": 0.784416887909174, "rewards/cosine_scaled_reward": -0.0580564308911562, "rewards/format_reward": 0.43750000558793545, "step": 67 }, { "completion_length": 2892.854202270508, "epoch": 0.029223744292237442, "grad_norm": 0.11767356097698212, "kl": 0.013798236846923828, "learning_rate": 9.964516155915151e-07, "loss": 0.0355, "reward": 0.1110092531889677, "reward_std": 0.7136748433113098, "rewards/cosine_scaled_reward": -0.09032870596274734, "rewards/format_reward": 0.29166667349636555, "step": 68 }, { "completion_length": 3475.7500610351562, "epoch": 0.029653505237711524, "grad_norm": 0.15884071588516235, "kl": 0.0012587904930114746, "learning_rate": 9.960469931131936e-07, "loss": 0.0321, "reward": 0.010828567203134298, "reward_std": 0.8198906406760216, "rewards/cosine_scaled_reward": -0.08833572082221508, "rewards/format_reward": 0.1875000037252903, "step": 69 }, { "completion_length": 3235.9791870117188, "epoch": 0.030083266183185603, "grad_norm": 0.13603372871875763, "kl": 0.0033148527145385742, "learning_rate": 9.956206309337066e-07, "loss": 0.0568, "reward": -0.04882128722965717, "reward_std": 0.6749724298715591, "rewards/cosine_scaled_reward": -0.180660642683506, "rewards/format_reward": 0.31250000931322575, "step": 70 }, { "completion_length": 3465.812530517578, "epoch": 0.03051302712865968, "grad_norm": 0.09531030058860779, "kl": 0.0042400360107421875, "learning_rate": 9.951725498333448e-07, "loss": 0.0244, "reward": -0.23902579583227634, "reward_std": 0.5069700628519058, "rewards/cosine_scaled_reward": -0.16117956209927797, "rewards/format_reward": 0.0833333358168602, "step": 71 }, { "completion_length": 3112.1875, "epoch": 0.030942788074133764, "grad_norm": 0.1034625694155693, "kl": 0.0016942024230957031, "learning_rate": 9.947027716509488e-07, "loss": 0.0351, "reward": 0.3098149821162224, "reward_std": 0.5386030115187168, "rewards/cosine_scaled_reward": 0.009074151515960693, "rewards/format_reward": 0.29166667349636555, "step": 72 }, { "completion_length": 2707.5833587646484, "epoch": 0.03137254901960784, "grad_norm": 0.1078762486577034, "kl": 0.0032265186309814453, "learning_rate": 9.942113192828444e-07, "loss": 0.0475, "reward": -0.044910063967108727, "reward_std": 0.5724170878529549, "rewards/cosine_scaled_reward": -0.22037170524708927, "rewards/format_reward": 0.3958333358168602, "step": 73 }, { "completion_length": 3196.9583435058594, "epoch": 0.031802309965081925, "grad_norm": 0.11188928782939911, "kl": 0.001481473445892334, "learning_rate": 9.93698216681727e-07, "loss": 0.0181, "reward": -0.11161274090409279, "reward_std": 0.49758703261613846, "rewards/cosine_scaled_reward": -0.13913970813155174, "rewards/format_reward": 0.1666666679084301, "step": 74 }, { "completion_length": 3096.875030517578, "epoch": 0.032232070910556, "grad_norm": 0.15704314410686493, "kl": 0.0028219223022460938, "learning_rate": 9.931634888554935e-07, "loss": 0.0591, "reward": -0.008646398782730103, "reward_std": 0.6664852034300566, "rewards/cosine_scaled_reward": -0.12932320777326822, "rewards/format_reward": 0.25, "step": 75 }, { "completion_length": 3209.5208587646484, "epoch": 0.03266183185603008, "grad_norm": 0.11482009291648865, "kl": 0.002855837345123291, "learning_rate": 9.926071618660237e-07, "loss": -0.0105, "reward": 0.056005772203207016, "reward_std": 0.6242523468099535, "rewards/cosine_scaled_reward": -0.11783044971525669, "rewards/format_reward": 0.2916666716337204, "step": 76 }, { "completion_length": 3292.2291870117188, "epoch": 0.033091592801504165, "grad_norm": 0.12117301672697067, "kl": 0.0023589134216308594, "learning_rate": 9.9202926282791e-07, "loss": 0.0332, "reward": -0.10070406273007393, "reward_std": 0.7597025670111179, "rewards/cosine_scaled_reward": -0.14410204160958529, "rewards/format_reward": 0.18750000186264515, "step": 77 }, { "completion_length": 3529.375030517578, "epoch": 0.03352135374697824, "grad_norm": 0.1249091699719429, "kl": 0.002429485321044922, "learning_rate": 9.91429819907136e-07, "loss": 0.0212, "reward": -0.30287766456604004, "reward_std": 0.5055086389183998, "rewards/cosine_scaled_reward": -0.18268883368000388, "rewards/format_reward": 0.06250000186264515, "step": 78 }, { "completion_length": 3567.750030517578, "epoch": 0.03395111469245232, "grad_norm": 0.06531081348657608, "kl": 0.0036420822143554688, "learning_rate": 9.908088623197048e-07, "loss": 0.0018, "reward": -0.46082823909819126, "reward_std": 0.37099857069551945, "rewards/cosine_scaled_reward": -0.26166411861777306, "rewards/format_reward": 0.06250000186264515, "step": 79 }, { "completion_length": 3467.9166870117188, "epoch": 0.034380875637926404, "grad_norm": 0.1288522332906723, "kl": 0.003486037254333496, "learning_rate": 9.901664203302124e-07, "loss": 0.0443, "reward": -0.12184088630601764, "reward_std": 0.649023462086916, "rewards/cosine_scaled_reward": -0.14425377547740936, "rewards/format_reward": 0.1666666716337204, "step": 80 }, { "completion_length": 3025.5208587646484, "epoch": 0.03481063658340049, "grad_norm": 0.10254532843828201, "kl": 0.0027785301208496094, "learning_rate": 9.895025252503755e-07, "loss": 0.0598, "reward": 0.0099591463804245, "reward_std": 0.5616412572562695, "rewards/cosine_scaled_reward": -0.10960375843569636, "rewards/format_reward": 0.22916666977107525, "step": 81 }, { "completion_length": 3215.1041870117188, "epoch": 0.03524039752887456, "grad_norm": 0.10801918804645538, "kl": 0.0048482418060302734, "learning_rate": 9.888172094375033e-07, "loss": 0.0423, "reward": -0.08801804296672344, "reward_std": 0.5365799330174923, "rewards/cosine_scaled_reward": -0.1377590410411358, "rewards/format_reward": 0.18750000186264515, "step": 82 }, { "completion_length": 3561.8333435058594, "epoch": 0.035670158474348644, "grad_norm": 0.05757669359445572, "kl": 0.0035609006881713867, "learning_rate": 9.881105062929221e-07, "loss": 0.0066, "reward": -0.5362846422940493, "reward_std": 0.3612187523394823, "rewards/cosine_scaled_reward": -0.27855898439884186, "rewards/format_reward": 0.02083333395421505, "step": 83 }, { "completion_length": 3555.125030517578, "epoch": 0.036099919419822726, "grad_norm": 0.11531209945678711, "kl": 0.0016581416130065918, "learning_rate": 9.873824502603459e-07, "loss": 0.0145, "reward": -0.21873824088834226, "reward_std": 0.607675589621067, "rewards/cosine_scaled_reward": -0.1510357903316617, "rewards/format_reward": 0.0833333358168602, "step": 84 }, { "completion_length": 3350.1041870117188, "epoch": 0.0365296803652968, "grad_norm": 0.21427154541015625, "kl": 0.016910433769226074, "learning_rate": 9.866330768241983e-07, "loss": 0.0256, "reward": 0.4170064367353916, "reward_std": 0.6577188894152641, "rewards/cosine_scaled_reward": 0.031419893726706505, "rewards/format_reward": 0.3541666753590107, "step": 85 }, { "completion_length": 3257.625045776367, "epoch": 0.036959441310770884, "grad_norm": 0.1424005627632141, "kl": 0.0036377906799316406, "learning_rate": 9.85862422507884e-07, "loss": 0.0576, "reward": -0.11163964122533798, "reward_std": 0.6495732273906469, "rewards/cosine_scaled_reward": -0.14956981502473354, "rewards/format_reward": 0.1875000074505806, "step": 86 }, { "completion_length": 2831.2291717529297, "epoch": 0.037389202256244966, "grad_norm": 0.10236720740795135, "kl": 0.005584716796875, "learning_rate": 9.850705248720068e-07, "loss": 0.0632, "reward": 0.02417793497443199, "reward_std": 0.5560131501406431, "rewards/cosine_scaled_reward": -0.14416103437542915, "rewards/format_reward": 0.3125000149011612, "step": 87 }, { "completion_length": 3529.4791870117188, "epoch": 0.03781896320171904, "grad_norm": 0.14017605781555176, "kl": 0.0023620128631591797, "learning_rate": 9.8425742251254e-07, "loss": 0.0104, "reward": -0.19280400034040213, "reward_std": 0.7251274883747101, "rewards/cosine_scaled_reward": -0.16931866854429245, "rewards/format_reward": 0.14583333395421505, "step": 88 }, { "completion_length": 3472.7916870117188, "epoch": 0.038248724147193124, "grad_norm": 0.05401326343417168, "kl": 0.0041751861572265625, "learning_rate": 9.83423155058946e-07, "loss": 0.0011, "reward": -0.2916420176625252, "reward_std": 0.307975216768682, "rewards/cosine_scaled_reward": -0.2083210125565529, "rewards/format_reward": 0.125, "step": 89 }, { "completion_length": 3543.7291870117188, "epoch": 0.038678485092667206, "grad_norm": 0.12139870226383209, "kl": 0.0059053897857666016, "learning_rate": 9.825677631722435e-07, "loss": 0.0096, "reward": -0.44505423679947853, "reward_std": 0.5007496885955334, "rewards/cosine_scaled_reward": -0.27461045421659946, "rewards/format_reward": 0.10416666977107525, "step": 90 }, { "completion_length": 3380.625030517578, "epoch": 0.03910824603814128, "grad_norm": 0.1062501072883606, "kl": 0.00391697883605957, "learning_rate": 9.816912885430258e-07, "loss": 0.0197, "reward": -0.22237559407949448, "reward_std": 0.4433041997253895, "rewards/cosine_scaled_reward": -0.22577113937586546, "rewards/format_reward": 0.2291666679084301, "step": 91 }, { "completion_length": 3500.0416870117188, "epoch": 0.03953800698361536, "grad_norm": 0.14027561247348785, "kl": 0.00739598274230957, "learning_rate": 9.807937738894303e-07, "loss": 0.0303, "reward": -0.29646059684455395, "reward_std": 0.7104594595730305, "rewards/cosine_scaled_reward": -0.21073030307888985, "rewards/format_reward": 0.12500000186264515, "step": 92 }, { "completion_length": 3428.125, "epoch": 0.039967767929089446, "grad_norm": 0.10713568329811096, "kl": 0.004438877105712891, "learning_rate": 9.798752629550546e-07, "loss": -0.0082, "reward": -0.053891924559138715, "reward_std": 0.5902169775217772, "rewards/cosine_scaled_reward": -0.12069597654044628, "rewards/format_reward": 0.18750000558793545, "step": 93 }, { "completion_length": 3584.0, "epoch": 0.04039752887456352, "grad_norm": 0.048110585659742355, "kl": 0.004644632339477539, "learning_rate": 9.78935800506826e-07, "loss": 0.0002, "reward": -0.513519424945116, "reward_std": 0.30109312757849693, "rewards/cosine_scaled_reward": -0.25675972178578377, "rewards/format_reward": 0.0, "step": 94 }, { "completion_length": 3492.6458740234375, "epoch": 0.0408272898200376, "grad_norm": 0.13268864154815674, "kl": 0.004797935485839844, "learning_rate": 9.779754323328192e-07, "loss": 0.0591, "reward": -0.26034877821803093, "reward_std": 0.6858366914093494, "rewards/cosine_scaled_reward": -0.17184105108026415, "rewards/format_reward": 0.0833333358168602, "step": 95 }, { "completion_length": 3340.7083740234375, "epoch": 0.041257050765511685, "grad_norm": 0.09160806983709335, "kl": 0.0044078826904296875, "learning_rate": 9.769942052400235e-07, "loss": 0.0283, "reward": -0.16240805014967918, "reward_std": 0.5213533006608486, "rewards/cosine_scaled_reward": -0.1853706892579794, "rewards/format_reward": 0.2083333432674408, "step": 96 }, { "completion_length": 3280.6041870117188, "epoch": 0.04168681171098577, "grad_norm": 0.12961438298225403, "kl": 0.005374908447265625, "learning_rate": 9.759921670520634e-07, "loss": 0.0264, "reward": -0.15450410917401314, "reward_std": 0.6175425332039595, "rewards/cosine_scaled_reward": -0.17100205505266786, "rewards/format_reward": 0.18750000186264515, "step": 97 }, { "completion_length": 3584.0, "epoch": 0.04211657265645984, "grad_norm": 0.10360225290060043, "kl": 0.0040416717529296875, "learning_rate": 9.749693666068663e-07, "loss": 0.0002, "reward": -0.3778139092028141, "reward_std": 0.46871285885572433, "rewards/cosine_scaled_reward": -0.1993236212292686, "rewards/format_reward": 0.02083333395421505, "step": 98 }, { "completion_length": 3474.9583435058594, "epoch": 0.042546333601933925, "grad_norm": 0.1495775580406189, "kl": 0.0042266845703125, "learning_rate": 9.739258537542835e-07, "loss": 0.0337, "reward": 0.04929289221763611, "reward_std": 0.8450847752392292, "rewards/cosine_scaled_reward": -0.07952022459357977, "rewards/format_reward": 0.2083333395421505, "step": 99 }, { "completion_length": 3367.791748046875, "epoch": 0.04297609454740801, "grad_norm": 0.16465169191360474, "kl": 0.00579071044921875, "learning_rate": 9.728616793536587e-07, "loss": 0.029, "reward": -0.01972397044301033, "reward_std": 0.9458978213369846, "rewards/cosine_scaled_reward": -0.11402865406125784, "rewards/format_reward": 0.2083333395421505, "step": 100 }, { "completion_length": 3584.0, "epoch": 0.04340585549288208, "grad_norm": 0.09728045016527176, "kl": 0.0034818649291992188, "learning_rate": 9.717768952713511e-07, "loss": 0.0001, "reward": -0.4636741569265723, "reward_std": 0.46765993162989616, "rewards/cosine_scaled_reward": -0.2422537449747324, "rewards/format_reward": 0.02083333395421505, "step": 101 }, { "completion_length": 3504.0000610351562, "epoch": 0.043835616438356165, "grad_norm": 0.15874584019184113, "kl": 0.00865936279296875, "learning_rate": 9.706715543782064e-07, "loss": 0.0514, "reward": -0.1984290238469839, "reward_std": 0.726914469152689, "rewards/cosine_scaled_reward": -0.15129784308373928, "rewards/format_reward": 0.10416666977107525, "step": 102 }, { "completion_length": 3578.062530517578, "epoch": 0.04426537738383025, "grad_norm": 0.15544135868549347, "kl": 0.006952762603759766, "learning_rate": 9.695457105469804e-07, "loss": 0.0046, "reward": -0.17355967313051224, "reward_std": 0.7352991811931133, "rewards/cosine_scaled_reward": -0.12844650604529306, "rewards/format_reward": 0.0833333358168602, "step": 103 }, { "completion_length": 3575.1875, "epoch": 0.04469513832930432, "grad_norm": 0.076705701649189, "kl": 0.006405830383300781, "learning_rate": 9.683994186497132e-07, "loss": 0.0029, "reward": -0.411870326846838, "reward_std": 0.39427221193909645, "rewards/cosine_scaled_reward": -0.22676849737763405, "rewards/format_reward": 0.0416666679084301, "step": 104 }, { "completion_length": 3446.000030517578, "epoch": 0.045124899274778404, "grad_norm": 0.09668397903442383, "kl": 0.004080772399902344, "learning_rate": 9.672327345550543e-07, "loss": 0.0249, "reward": -0.31605136953294277, "reward_std": 0.5398397855460644, "rewards/cosine_scaled_reward": -0.21010902244597673, "rewards/format_reward": 0.10416666977107525, "step": 105 }, { "completion_length": 3088.8125610351562, "epoch": 0.04555466022025249, "grad_norm": 0.16100294888019562, "kl": 0.005070209503173828, "learning_rate": 9.66045715125541e-07, "loss": 0.0617, "reward": 0.3683682959526777, "reward_std": 1.0179261192679405, "rewards/cosine_scaled_reward": -0.003315861918963492, "rewards/format_reward": 0.37500000558793545, "step": 106 }, { "completion_length": 3360.5209045410156, "epoch": 0.04598442116572656, "grad_norm": 0.17470990121364594, "kl": 0.004656791687011719, "learning_rate": 9.648384182148252e-07, "loss": 0.0635, "reward": 0.14809904247522354, "reward_std": 1.0914807617664337, "rewards/cosine_scaled_reward": -0.08220048528164625, "rewards/format_reward": 0.3125000074505806, "step": 107 }, { "completion_length": 3417.6041870117188, "epoch": 0.046414182111200644, "grad_norm": 0.12381262332201004, "kl": 0.005648612976074219, "learning_rate": 9.636109026648554e-07, "loss": 0.0041, "reward": -0.13490474259015173, "reward_std": 0.6939134374260902, "rewards/cosine_scaled_reward": -0.22370237647555768, "rewards/format_reward": 0.3125000074505806, "step": 108 }, { "completion_length": 3167.958366394043, "epoch": 0.046843943056674726, "grad_norm": 0.17516814172267914, "kl": 0.006865739822387695, "learning_rate": 9.623632283030077e-07, "loss": 0.0323, "reward": 0.1824771724641323, "reward_std": 0.4511121194809675, "rewards/cosine_scaled_reward": -0.023344729095697403, "rewards/format_reward": 0.22916666977107525, "step": 109 }, { "completion_length": 3441.75, "epoch": 0.0472737040021488, "grad_norm": 0.1555563062429428, "kl": 0.0050525665283203125, "learning_rate": 9.610954559391704e-07, "loss": 0.029, "reward": -0.22800780087709427, "reward_std": 0.532591599971056, "rewards/cosine_scaled_reward": -0.18692057486623526, "rewards/format_reward": 0.1458333395421505, "step": 110 }, { "completion_length": 3421.2916870117188, "epoch": 0.047703464947622884, "grad_norm": 0.08717039972543716, "kl": 0.007814407348632812, "learning_rate": 9.598076473627796e-07, "loss": 0.0248, "reward": -0.3660214664414525, "reward_std": 0.4175565466284752, "rewards/cosine_scaled_reward": -0.2142607346177101, "rewards/format_reward": 0.0625, "step": 111 }, { "completion_length": 3060.5833587646484, "epoch": 0.048133225893096966, "grad_norm": 0.15273751318454742, "kl": 0.006236076354980469, "learning_rate": 9.58499865339809e-07, "loss": 0.0342, "reward": 0.17923793010413647, "reward_std": 0.7778838276863098, "rewards/cosine_scaled_reward": -0.07704770751297474, "rewards/format_reward": 0.3333333358168602, "step": 112 }, { "completion_length": 3450.6458740234375, "epoch": 0.04856298683857104, "grad_norm": 0.09182659536600113, "kl": 0.006908893585205078, "learning_rate": 9.571721736097088e-07, "loss": 0.0319, "reward": -0.6143023446202278, "reward_std": 0.4347798265516758, "rewards/cosine_scaled_reward": -0.3488178439438343, "rewards/format_reward": 0.0833333358168602, "step": 113 }, { "completion_length": 3440.8125, "epoch": 0.048992747784045124, "grad_norm": 0.10586244612932205, "kl": 0.009021759033203125, "learning_rate": 9.55824636882301e-07, "loss": 0.0529, "reward": -0.19712361320853233, "reward_std": 0.5763232838362455, "rewards/cosine_scaled_reward": -0.17147847125306726, "rewards/format_reward": 0.1458333358168602, "step": 114 }, { "completion_length": 3480.3333740234375, "epoch": 0.049422508729519206, "grad_norm": 0.1035589724779129, "kl": 0.0139923095703125, "learning_rate": 9.54457320834625e-07, "loss": 0.0163, "reward": -0.477382599376142, "reward_std": 0.46347317658364773, "rewards/cosine_scaled_reward": -0.29077463410794735, "rewards/format_reward": 0.10416666977107525, "step": 115 }, { "completion_length": 3465.6458740234375, "epoch": 0.04985226967499329, "grad_norm": 0.12334461510181427, "kl": 0.006648063659667969, "learning_rate": 9.530702921077358e-07, "loss": 0.029, "reward": -0.2585864200955257, "reward_std": 0.660937674343586, "rewards/cosine_scaled_reward": -0.1917932154610753, "rewards/format_reward": 0.1250000037252903, "step": 116 }, { "completion_length": 3395.9791870117188, "epoch": 0.05028203062046736, "grad_norm": 0.11316703259944916, "kl": 0.011442184448242188, "learning_rate": 9.516636183034564e-07, "loss": 0.0314, "reward": -0.19809363968670368, "reward_std": 0.6341227479279041, "rewards/cosine_scaled_reward": -0.19279681518673897, "rewards/format_reward": 0.18750000558793545, "step": 117 }, { "completion_length": 3584.0, "epoch": 0.050711791565941446, "grad_norm": 0.06273533403873444, "kl": 0.0045909881591796875, "learning_rate": 9.502373679810839e-07, "loss": 0.0002, "reward": -0.5388198830187321, "reward_std": 0.3540770895779133, "rewards/cosine_scaled_reward": -0.26940993778407574, "rewards/format_reward": 0.0, "step": 118 }, { "completion_length": 3549.5208740234375, "epoch": 0.05114155251141553, "grad_norm": 0.1725703328847885, "kl": 0.0069904327392578125, "learning_rate": 9.487916106540465e-07, "loss": 0.0178, "reward": -0.3363003544509411, "reward_std": 0.6832092553377151, "rewards/cosine_scaled_reward": -0.20981684606522322, "rewards/format_reward": 0.0833333358168602, "step": 119 }, { "completion_length": 3489.1458435058594, "epoch": 0.0515713134568896, "grad_norm": 0.1931406855583191, "kl": 0.006278514862060547, "learning_rate": 9.473264167865171e-07, "loss": 0.0094, "reward": 0.020751532167196274, "reward_std": 0.7602911926805973, "rewards/cosine_scaled_reward": -0.10420757345855236, "rewards/format_reward": 0.2291666679084301, "step": 120 }, { "completion_length": 3486.1875, "epoch": 0.052001074402363685, "grad_norm": 0.1388317495584488, "kl": 0.011905670166015625, "learning_rate": 9.458418577899774e-07, "loss": 0.0272, "reward": -0.3421749100089073, "reward_std": 0.67668117582798, "rewards/cosine_scaled_reward": -0.22317078988999128, "rewards/format_reward": 0.10416666977107525, "step": 121 }, { "completion_length": 3119.4166870117188, "epoch": 0.05243083534783777, "grad_norm": 0.24518685042858124, "kl": 0.015209197998046875, "learning_rate": 9.443380060197385e-07, "loss": 0.0693, "reward": 0.13269708678126335, "reward_std": 0.8334100134670734, "rewards/cosine_scaled_reward": -0.11073479428887367, "rewards/format_reward": 0.3541666753590107, "step": 122 }, { "completion_length": 3445.437530517578, "epoch": 0.05286059629331184, "grad_norm": 0.09794328361749649, "kl": 0.007897377014160156, "learning_rate": 9.428149347714143e-07, "loss": 0.0285, "reward": -0.34195050224661827, "reward_std": 0.5295557156205177, "rewards/cosine_scaled_reward": -0.26472525601275265, "rewards/format_reward": 0.1875000074505806, "step": 123 }, { "completion_length": 3272.104248046875, "epoch": 0.053290357238785925, "grad_norm": 0.11964410543441772, "kl": 0.00942230224609375, "learning_rate": 9.412727182773486e-07, "loss": 0.0404, "reward": -0.22742752730846405, "reward_std": 0.6182866506278515, "rewards/cosine_scaled_reward": -0.22829709760844707, "rewards/format_reward": 0.22916666977107525, "step": 124 }, { "completion_length": 3352.375030517578, "epoch": 0.05372011818426001, "grad_norm": 0.13996753096580505, "kl": 0.01125335693359375, "learning_rate": 9.397114317029974e-07, "loss": 0.0174, "reward": -0.023607328534126282, "reward_std": 0.6397439278662205, "rewards/cosine_scaled_reward": -0.11597032606368884, "rewards/format_reward": 0.2083333358168602, "step": 125 }, { "completion_length": 3580.375, "epoch": 0.05414987912973408, "grad_norm": 0.09929011017084122, "kl": 0.007893562316894531, "learning_rate": 9.381311511432658e-07, "loss": 0.0025, "reward": -0.3609709031879902, "reward_std": 0.5754331611096859, "rewards/cosine_scaled_reward": -0.2117354478687048, "rewards/format_reward": 0.06250000186264515, "step": 126 }, { "completion_length": 3484.8333740234375, "epoch": 0.054579640075208165, "grad_norm": 0.16934768855571747, "kl": 0.009029388427734375, "learning_rate": 9.36531953618799e-07, "loss": 0.0381, "reward": -0.13774851895868778, "reward_std": 0.7257766649127007, "rewards/cosine_scaled_reward": -0.14179092459380627, "rewards/format_reward": 0.14583333767950535, "step": 127 }, { "completion_length": 3366.666717529297, "epoch": 0.05500940102068225, "grad_norm": 0.13089871406555176, "kl": 0.0181732177734375, "learning_rate": 9.34913917072228e-07, "loss": 0.0542, "reward": -0.1605115095153451, "reward_std": 0.6148231886327267, "rewards/cosine_scaled_reward": -0.18442242656601593, "rewards/format_reward": 0.2083333395421505, "step": 128 }, { "completion_length": 3269.125045776367, "epoch": 0.05543916196615632, "grad_norm": 0.2021060287952423, "kl": 0.006897926330566406, "learning_rate": 9.332771203643714e-07, "loss": 0.0526, "reward": 0.28505595680326223, "reward_std": 1.068858578801155, "rewards/cosine_scaled_reward": -0.0033053644001483917, "rewards/format_reward": 0.2916666716337204, "step": 129 }, { "completion_length": 3148.2708740234375, "epoch": 0.055868922911630405, "grad_norm": 0.11248363554477692, "kl": 0.010776519775390625, "learning_rate": 9.316216432703916e-07, "loss": 0.014, "reward": 0.003622163087129593, "reward_std": 0.521043318323791, "rewards/cosine_scaled_reward": -0.13360558450222015, "rewards/format_reward": 0.2708333395421505, "step": 130 }, { "completion_length": 3027.4583435058594, "epoch": 0.05629868385710449, "grad_norm": 0.08546566218137741, "kl": 0.01204681396484375, "learning_rate": 9.299475664759068e-07, "loss": 0.0252, "reward": -0.15484870062209666, "reward_std": 0.5217671655118465, "rewards/cosine_scaled_reward": -0.22325768577866256, "rewards/format_reward": 0.29166666977107525, "step": 131 }, { "completion_length": 3463.0208740234375, "epoch": 0.05672844480257857, "grad_norm": 0.16975514590740204, "kl": 0.010540008544921875, "learning_rate": 9.282549715730579e-07, "loss": 0.0532, "reward": -0.2093625129200518, "reward_std": 0.6891574300825596, "rewards/cosine_scaled_reward": -0.15676459204405546, "rewards/format_reward": 0.1041666679084301, "step": 132 }, { "completion_length": 3549.250030517578, "epoch": 0.057158205748052644, "grad_norm": 0.1109393835067749, "kl": 0.011675834655761719, "learning_rate": 9.265439410565328e-07, "loss": 0.0146, "reward": -0.4981108419597149, "reward_std": 0.5907085239887238, "rewards/cosine_scaled_reward": -0.2698887623846531, "rewards/format_reward": 0.0416666679084301, "step": 133 }, { "completion_length": 3238.500030517578, "epoch": 0.057587966693526726, "grad_norm": 0.14073127508163452, "kl": 0.011211395263671875, "learning_rate": 9.248145583195447e-07, "loss": 0.047, "reward": -0.04680259805172682, "reward_std": 0.6992146894335747, "rewards/cosine_scaled_reward": -0.17965130228549242, "rewards/format_reward": 0.31250000558793545, "step": 134 }, { "completion_length": 3542.8541870117188, "epoch": 0.05801772763900081, "grad_norm": 0.10204687714576721, "kl": 0.01340484619140625, "learning_rate": 9.230669076497687e-07, "loss": 0.0155, "reward": -0.40212307497859, "reward_std": 0.4828140716999769, "rewards/cosine_scaled_reward": -0.2323115412145853, "rewards/format_reward": 0.06250000186264515, "step": 135 }, { "completion_length": 3471.5833740234375, "epoch": 0.058447488584474884, "grad_norm": 0.15894722938537598, "kl": 0.01526641845703125, "learning_rate": 9.213010742252327e-07, "loss": 0.049, "reward": -0.18211988359689713, "reward_std": 0.5930627919733524, "rewards/cosine_scaled_reward": -0.14314327016472816, "rewards/format_reward": 0.1041666679084301, "step": 136 }, { "completion_length": 3584.0, "epoch": 0.058877249529948966, "grad_norm": 0.10171353071928024, "kl": 0.013339996337890625, "learning_rate": 9.195171441101668e-07, "loss": 0.0005, "reward": -0.3832127247005701, "reward_std": 0.5324176596477628, "rewards/cosine_scaled_reward": -0.21243969385977834, "rewards/format_reward": 0.0416666679084301, "step": 137 }, { "completion_length": 3437.0416870117188, "epoch": 0.05930701047542305, "grad_norm": 0.10177407413721085, "kl": 0.011548995971679688, "learning_rate": 9.177152042508077e-07, "loss": 0.026, "reward": -0.3453960847109556, "reward_std": 0.38140716776251793, "rewards/cosine_scaled_reward": -0.22478137537837029, "rewards/format_reward": 0.10416666977107525, "step": 138 }, { "completion_length": 3140.1667098999023, "epoch": 0.059736771420897124, "grad_norm": 0.12867099046707153, "kl": 0.014875411987304688, "learning_rate": 9.158953424711624e-07, "loss": 0.0377, "reward": 0.07128191366791725, "reward_std": 0.5858375579118729, "rewards/cosine_scaled_reward": -0.07894237898290157, "rewards/format_reward": 0.2291666679084301, "step": 139 }, { "completion_length": 3396.0416870117188, "epoch": 0.060166532366371206, "grad_norm": 0.16812600195407867, "kl": 0.013219833374023438, "learning_rate": 9.140576474687263e-07, "loss": 0.0249, "reward": 0.19550307467579842, "reward_std": 0.5796510288491845, "rewards/cosine_scaled_reward": 0.004001514986157417, "rewards/format_reward": 0.1875000074505806, "step": 140 }, { "completion_length": 3425.2916870117188, "epoch": 0.06059629331184529, "grad_norm": 0.07505161315202713, "kl": 0.013311386108398438, "learning_rate": 9.122022088101613e-07, "loss": 0.0136, "reward": -0.07056656666100025, "reward_std": 0.35567755810916424, "rewards/cosine_scaled_reward": -0.11861661169677973, "rewards/format_reward": 0.1666666679084301, "step": 141 }, { "completion_length": 3415.9791870117188, "epoch": 0.06102605425731936, "grad_norm": 0.11718086153268814, "kl": 0.008388519287109375, "learning_rate": 9.103291169269299e-07, "loss": 0.057, "reward": -0.19486494362354279, "reward_std": 0.6301777996122837, "rewards/cosine_scaled_reward": -0.14951580949127674, "rewards/format_reward": 0.1041666679084301, "step": 142 }, { "completion_length": 3578.7083435058594, "epoch": 0.061455815202793446, "grad_norm": 0.0752999410033226, "kl": 0.01139068603515625, "learning_rate": 9.084384631108882e-07, "loss": 0.0023, "reward": -0.5187571868300438, "reward_std": 0.3663038872182369, "rewards/cosine_scaled_reward": -0.26979526691138744, "rewards/format_reward": 0.02083333395421505, "step": 143 }, { "completion_length": 3468.7291870117188, "epoch": 0.06188557614826753, "grad_norm": 0.10373479872941971, "kl": 0.014919281005859375, "learning_rate": 9.065303395098358e-07, "loss": 0.0333, "reward": -0.2932019904255867, "reward_std": 0.4758964329957962, "rewards/cosine_scaled_reward": -0.18826765939593315, "rewards/format_reward": 0.0833333358168602, "step": 144 }, { "completion_length": 3497.0, "epoch": 0.0623153370937416, "grad_norm": 0.10871489346027374, "kl": 0.01686859130859375, "learning_rate": 9.046048391230247e-07, "loss": 0.02, "reward": -0.3449946716427803, "reward_std": 0.5674018263816833, "rewards/cosine_scaled_reward": -0.2349973302334547, "rewards/format_reward": 0.1250000037252903, "step": 145 }, { "completion_length": 3328.5625, "epoch": 0.06274509803921569, "grad_norm": 0.07163377106189728, "kl": 0.0128173828125, "learning_rate": 9.026620557966279e-07, "loss": 0.0017, "reward": -0.37629315070807934, "reward_std": 0.3654390387237072, "rewards/cosine_scaled_reward": -0.2610632386058569, "rewards/format_reward": 0.14583333395421505, "step": 146 }, { "completion_length": 3459.5625, "epoch": 0.06317485898468976, "grad_norm": 0.06434406340122223, "kl": 0.015018463134765625, "learning_rate": 9.007020842191634e-07, "loss": 0.004, "reward": -0.37178758159279823, "reward_std": 0.36646568775177, "rewards/cosine_scaled_reward": -0.23797712102532387, "rewards/format_reward": 0.1041666716337204, "step": 147 }, { "completion_length": 3433.5, "epoch": 0.06360461993016385, "grad_norm": 0.10527411848306656, "kl": 0.013702392578125, "learning_rate": 8.987250199168808e-07, "loss": 0.0339, "reward": -0.04528743866831064, "reward_std": 0.5125896688550711, "rewards/cosine_scaled_reward": -0.10597705328837037, "rewards/format_reward": 0.1666666716337204, "step": 148 }, { "completion_length": 3379.1666870117188, "epoch": 0.06403438087563793, "grad_norm": 0.10338223725557327, "kl": 0.017597198486328125, "learning_rate": 8.967309592491052e-07, "loss": 0.0121, "reward": -0.1979610212147236, "reward_std": 0.5748166777193546, "rewards/cosine_scaled_reward": -0.21356384828686714, "rewards/format_reward": 0.22916666977107525, "step": 149 }, { "completion_length": 3461.687530517578, "epoch": 0.064464141821112, "grad_norm": 0.07124616950750351, "kl": 0.01663970947265625, "learning_rate": 8.9471999940354e-07, "loss": 0.0114, "reward": -0.32599183544516563, "reward_std": 0.3998993746936321, "rewards/cosine_scaled_reward": -0.23591258190572262, "rewards/format_reward": 0.14583333767950535, "step": 150 }, { "completion_length": 3246.6458587646484, "epoch": 0.06489390276658609, "grad_norm": 0.1280849128961563, "kl": 0.011165618896484375, "learning_rate": 8.926922383915315e-07, "loss": 0.0501, "reward": 0.001697378233075142, "reward_std": 0.7034376226365566, "rewards/cosine_scaled_reward": -0.11373465570795815, "rewards/format_reward": 0.2291666753590107, "step": 151 }, { "completion_length": 3058.3958892822266, "epoch": 0.06532366371206016, "grad_norm": 0.14417125284671783, "kl": 0.018573760986328125, "learning_rate": 8.906477750432903e-07, "loss": 0.0761, "reward": -0.11197322607040405, "reward_std": 0.6637962535023689, "rewards/cosine_scaled_reward": -0.17056994792073965, "rewards/format_reward": 0.22916667349636555, "step": 152 }, { "completion_length": 3426.625030517578, "epoch": 0.06575342465753424, "grad_norm": 0.1690993309020996, "kl": 0.022373199462890625, "learning_rate": 8.88586709003076e-07, "loss": 0.0454, "reward": -0.2943109832704067, "reward_std": 0.5692493468523026, "rewards/cosine_scaled_reward": -0.19923883862793446, "rewards/format_reward": 0.10416666977107525, "step": 153 }, { "completion_length": 3132.2500228881836, "epoch": 0.06618318560300833, "grad_norm": 0.2893940210342407, "kl": 0.015697479248046875, "learning_rate": 8.865091407243394e-07, "loss": 0.0559, "reward": -0.031232688575983047, "reward_std": 0.7150415852665901, "rewards/cosine_scaled_reward": -0.10936635360121727, "rewards/format_reward": 0.18750000186264515, "step": 154 }, { "completion_length": 3532.8125, "epoch": 0.0666129465484824, "grad_norm": 0.0997319146990776, "kl": 0.01625823974609375, "learning_rate": 8.844151714648274e-07, "loss": -0.0078, "reward": -0.34672408178448677, "reward_std": 0.5103256516158581, "rewards/cosine_scaled_reward": -0.20461204182356596, "rewards/format_reward": 0.0625, "step": 155 }, { "completion_length": 3299.312515258789, "epoch": 0.06704270749395648, "grad_norm": 0.11455141752958298, "kl": 0.016597747802734375, "learning_rate": 8.823049032816478e-07, "loss": 0.0434, "reward": -0.14216446783393621, "reward_std": 0.5931817293167114, "rewards/cosine_scaled_reward": -0.20649890042841434, "rewards/format_reward": 0.2708333395421505, "step": 156 }, { "completion_length": 3074.3750228881836, "epoch": 0.06747246843943057, "grad_norm": 0.12548862397670746, "kl": 0.017139434814453125, "learning_rate": 8.801784390262943e-07, "loss": 0.0272, "reward": 0.16711216093972325, "reward_std": 0.6276020519435406, "rewards/cosine_scaled_reward": -0.06227726023644209, "rewards/format_reward": 0.29166666977107525, "step": 157 }, { "completion_length": 3308.0000610351562, "epoch": 0.06790222938490464, "grad_norm": 0.18085293471813202, "kl": 0.022403717041015625, "learning_rate": 8.780358823396352e-07, "loss": 0.0558, "reward": 0.17560554668307304, "reward_std": 0.8071559220552444, "rewards/cosine_scaled_reward": -0.07886389270424843, "rewards/format_reward": 0.3333333432674408, "step": 158 }, { "completion_length": 2899.2083435058594, "epoch": 0.06833199033037873, "grad_norm": 0.12107068300247192, "kl": 0.022735595703125, "learning_rate": 8.758773376468604e-07, "loss": 0.022, "reward": 0.05124075151979923, "reward_std": 0.6395381707698107, "rewards/cosine_scaled_reward": -0.16187963029369712, "rewards/format_reward": 0.37500000186264515, "step": 159 }, { "completion_length": 3469.0208740234375, "epoch": 0.06876175127585281, "grad_norm": 0.11639764159917831, "kl": 0.022029876708984375, "learning_rate": 8.737029101523929e-07, "loss": 0.0189, "reward": -0.18369666393846273, "reward_std": 0.5852576978504658, "rewards/cosine_scaled_reward": -0.1855983359273523, "rewards/format_reward": 0.1875000037252903, "step": 160 }, { "completion_length": 3441.5000610351562, "epoch": 0.06919151222132688, "grad_norm": 0.18354545533657074, "kl": 0.01715087890625, "learning_rate": 8.715127058347614e-07, "loss": 0.0503, "reward": -0.11303183436393738, "reward_std": 0.8645796738564968, "rewards/cosine_scaled_reward": -0.13984924770193174, "rewards/format_reward": 0.1666666716337204, "step": 161 }, { "completion_length": 3400.812530517578, "epoch": 0.06962127316680097, "grad_norm": 0.07651402801275253, "kl": 0.01921844482421875, "learning_rate": 8.693068314414344e-07, "loss": 0.0064, "reward": -0.11796816065907478, "reward_std": 0.38571254536509514, "rewards/cosine_scaled_reward": -0.1319007594138384, "rewards/format_reward": 0.14583333395421505, "step": 162 }, { "completion_length": 3556.5208435058594, "epoch": 0.07005103411227505, "grad_norm": 0.1292174607515335, "kl": 0.013072967529296875, "learning_rate": 8.670853944836176e-07, "loss": 0.0169, "reward": -0.288902822881937, "reward_std": 0.6288706846535206, "rewards/cosine_scaled_reward": -0.18611807376146317, "rewards/format_reward": 0.0833333358168602, "step": 163 }, { "completion_length": 3297.3333740234375, "epoch": 0.07048079505774912, "grad_norm": 0.10644536465406418, "kl": 0.0239410400390625, "learning_rate": 8.648485032310144e-07, "loss": 0.0469, "reward": -0.20694768306566402, "reward_std": 0.6061897035688162, "rewards/cosine_scaled_reward": -0.21805717796087265, "rewards/format_reward": 0.22916667349636555, "step": 164 }, { "completion_length": 3055.5833892822266, "epoch": 0.07091055600322321, "grad_norm": 0.17674364149570465, "kl": 0.019815444946289062, "learning_rate": 8.625962667065487e-07, "loss": 0.0428, "reward": 0.19229037687182426, "reward_std": 0.6327087804675102, "rewards/cosine_scaled_reward": -0.049688142258673906, "rewards/format_reward": 0.2916666716337204, "step": 165 }, { "completion_length": 3256.6458435058594, "epoch": 0.07134031694869729, "grad_norm": 0.1690843403339386, "kl": 0.0176544189453125, "learning_rate": 8.603287946810513e-07, "loss": 0.0721, "reward": 0.14985787402838469, "reward_std": 0.6725077964365482, "rewards/cosine_scaled_reward": -0.05007106252014637, "rewards/format_reward": 0.25000000931322575, "step": 166 }, { "completion_length": 2920.354217529297, "epoch": 0.07177007789417136, "grad_norm": 0.10734570026397705, "kl": 0.0175933837890625, "learning_rate": 8.580461976679099e-07, "loss": 0.028, "reward": 0.30014135129749775, "reward_std": 0.6654414497315884, "rewards/cosine_scaled_reward": -0.07909599598497152, "rewards/format_reward": 0.4583333358168602, "step": 167 }, { "completion_length": 3429.7291870117188, "epoch": 0.07219983883964545, "grad_norm": 0.14779284596443176, "kl": 0.023284912109375, "learning_rate": 8.557485869176825e-07, "loss": 0.0251, "reward": -0.0562932500615716, "reward_std": 0.6552736014127731, "rewards/cosine_scaled_reward": -0.11147995479404926, "rewards/format_reward": 0.1666666679084301, "step": 168 }, { "completion_length": 3254.5, "epoch": 0.07262959978511953, "grad_norm": 0.13204683363437653, "kl": 0.02100372314453125, "learning_rate": 8.534360744126753e-07, "loss": 0.0134, "reward": -0.11221800139173865, "reward_std": 0.642895121127367, "rewards/cosine_scaled_reward": -0.1602756674401462, "rewards/format_reward": 0.2083333358168602, "step": 169 }, { "completion_length": 3294.6458435058594, "epoch": 0.0730593607305936, "grad_norm": 0.21877852082252502, "kl": 0.01103973388671875, "learning_rate": 8.511087728614862e-07, "loss": 0.0537, "reward": 0.2626834763213992, "reward_std": 0.8678100481629372, "rewards/cosine_scaled_reward": -0.056158279068768024, "rewards/format_reward": 0.3750000074505806, "step": 170 }, { "completion_length": 3159.9583435058594, "epoch": 0.07348912167606769, "grad_norm": 0.13359946012496948, "kl": 0.02030181884765625, "learning_rate": 8.487667956935087e-07, "loss": 0.0307, "reward": -0.12274059094488621, "reward_std": 0.7331548109650612, "rewards/cosine_scaled_reward": -0.16553696274058893, "rewards/format_reward": 0.2083333358168602, "step": 171 }, { "completion_length": 3013.041702270508, "epoch": 0.07391888262154177, "grad_norm": 0.16084308922290802, "kl": 0.02013397216796875, "learning_rate": 8.464102570534061e-07, "loss": 0.0222, "reward": 0.36476942151784897, "reward_std": 0.7891103178262711, "rewards/cosine_scaled_reward": 0.005301397293806076, "rewards/format_reward": 0.35416667722165585, "step": 172 }, { "completion_length": 3562.6458435058594, "epoch": 0.07434864356701584, "grad_norm": 0.12003089487552643, "kl": 0.0226898193359375, "learning_rate": 8.440392717955475e-07, "loss": 0.0109, "reward": -0.37344493716955185, "reward_std": 0.5199102722108364, "rewards/cosine_scaled_reward": -0.21797246765345335, "rewards/format_reward": 0.06250000186264515, "step": 173 }, { "completion_length": 3133.875030517578, "epoch": 0.07477840451248993, "grad_norm": 0.11335734277963638, "kl": 0.02759552001953125, "learning_rate": 8.416539554784089e-07, "loss": -0.024, "reward": 0.32874174043536186, "reward_std": 0.6159818805754185, "rewards/cosine_scaled_reward": -0.012712458148598671, "rewards/format_reward": 0.3541666716337204, "step": 174 }, { "completion_length": 3315.3333435058594, "epoch": 0.07520816545796401, "grad_norm": 0.11656147241592407, "kl": 0.024517059326171875, "learning_rate": 8.392544243589427e-07, "loss": 0.0141, "reward": -0.06426817551255226, "reward_std": 0.5298474039882421, "rewards/cosine_scaled_reward": -0.14671742729842663, "rewards/format_reward": 0.22916666977107525, "step": 175 }, { "completion_length": 3306.4583435058594, "epoch": 0.07563792640343808, "grad_norm": 0.06582502275705338, "kl": 0.021160125732421875, "learning_rate": 8.368407953869103e-07, "loss": 0.0017, "reward": -0.3400796912610531, "reward_std": 0.36345600709319115, "rewards/cosine_scaled_reward": -0.24295651353895664, "rewards/format_reward": 0.14583333395421505, "step": 176 }, { "completion_length": 2980.3541870117188, "epoch": 0.07606768734891217, "grad_norm": 0.23307356238365173, "kl": 0.0188446044921875, "learning_rate": 8.344131861991828e-07, "loss": 0.0602, "reward": 0.1728641726076603, "reward_std": 0.7631958788260818, "rewards/cosine_scaled_reward": -0.06981792487204075, "rewards/format_reward": 0.3125000037252903, "step": 177 }, { "completion_length": 3336.979217529297, "epoch": 0.07649744829438625, "grad_norm": 0.2260599583387375, "kl": 0.025699615478515625, "learning_rate": 8.319717151140072e-07, "loss": 0.0557, "reward": 0.365965761244297, "reward_std": 0.9702694937586784, "rewards/cosine_scaled_reward": 0.04756622388958931, "rewards/format_reward": 0.2708333395421505, "step": 178 }, { "completion_length": 3029.500015258789, "epoch": 0.07692720923986032, "grad_norm": 0.08061809092760086, "kl": 0.03124237060546875, "learning_rate": 8.295165011252396e-07, "loss": 0.0184, "reward": 0.06244013726245612, "reward_std": 0.4294763244688511, "rewards/cosine_scaled_reward": -0.10419660620391369, "rewards/format_reward": 0.2708333358168602, "step": 179 }, { "completion_length": 3410.500030517578, "epoch": 0.07735697018533441, "grad_norm": 0.16618837416172028, "kl": 0.033294677734375, "learning_rate": 8.270476638965461e-07, "loss": 0.0369, "reward": -0.1474200263619423, "reward_std": 0.6390394419431686, "rewards/cosine_scaled_reward": -0.16746002063155174, "rewards/format_reward": 0.1875000074505806, "step": 180 }, { "completion_length": 3482.312530517578, "epoch": 0.07778673113080849, "grad_norm": 0.07122974842786789, "kl": 0.0244140625, "learning_rate": 8.245653237555705e-07, "loss": 0.0148, "reward": -0.4937465451657772, "reward_std": 0.3878455087542534, "rewards/cosine_scaled_reward": -0.28853994235396385, "rewards/format_reward": 0.08333333395421505, "step": 181 }, { "completion_length": 3584.0, "epoch": 0.07821649207628256, "grad_norm": 0.08226907253265381, "kl": 0.0230560302734375, "learning_rate": 8.220696016880687e-07, "loss": 0.0009, "reward": -0.36028627678751945, "reward_std": 0.38098668679594994, "rewards/cosine_scaled_reward": -0.19055980537086725, "rewards/format_reward": 0.02083333395421505, "step": 182 }, { "completion_length": 2955.9583587646484, "epoch": 0.07864625302175665, "grad_norm": 0.16292187571525574, "kl": 0.02046966552734375, "learning_rate": 8.195606193320136e-07, "loss": 0.0543, "reward": 0.2583068711683154, "reward_std": 0.8180742636322975, "rewards/cosine_scaled_reward": -0.0687632355839014, "rewards/format_reward": 0.39583333767950535, "step": 183 }, { "completion_length": 3421.3541870117188, "epoch": 0.07907601396723073, "grad_norm": 0.18205097317695618, "kl": 0.033050537109375, "learning_rate": 8.170384989716657e-07, "loss": 0.0294, "reward": -0.08247991651296616, "reward_std": 0.6932689230889082, "rewards/cosine_scaled_reward": -0.16623996291309595, "rewards/format_reward": 0.25000000186264515, "step": 184 }, { "completion_length": 3399.375030517578, "epoch": 0.0795057749127048, "grad_norm": 0.13265007734298706, "kl": 0.0217742919921875, "learning_rate": 8.145033635316128e-07, "loss": 0.0283, "reward": 0.12451641820371151, "reward_std": 0.7071679830551147, "rewards/cosine_scaled_reward": -0.06274178437888622, "rewards/format_reward": 0.2500000074505806, "step": 185 }, { "completion_length": 3282.5834045410156, "epoch": 0.07993553585817889, "grad_norm": 0.2148338407278061, "kl": 0.03296661376953125, "learning_rate": 8.119553365707802e-07, "loss": 0.0775, "reward": -0.04748889245092869, "reward_std": 0.8150719180703163, "rewards/cosine_scaled_reward": -0.16957777948118746, "rewards/format_reward": 0.2916666716337204, "step": 186 }, { "completion_length": 3325.9375610351562, "epoch": 0.08036529680365297, "grad_norm": 0.10358522087335587, "kl": 0.03009033203125, "learning_rate": 8.093945422764069e-07, "loss": 0.0241, "reward": -0.2459661802276969, "reward_std": 0.49346040561795235, "rewards/cosine_scaled_reward": -0.2063164236024022, "rewards/format_reward": 0.16666666977107525, "step": 187 }, { "completion_length": 3092.062530517578, "epoch": 0.08079505774912704, "grad_norm": 0.14302615821361542, "kl": 0.02515411376953125, "learning_rate": 8.068211054579943e-07, "loss": 0.0396, "reward": 0.24111286364495754, "reward_std": 0.7264149412512779, "rewards/cosine_scaled_reward": -0.06694357469677925, "rewards/format_reward": 0.3750000111758709, "step": 188 }, { "completion_length": 3315.125030517578, "epoch": 0.08122481869460113, "grad_norm": 0.17394718527793884, "kl": 0.033294677734375, "learning_rate": 8.04235151541222e-07, "loss": -0.0137, "reward": -0.10913177952170372, "reward_std": 0.5816879943013191, "rewards/cosine_scaled_reward": -0.1587325558066368, "rewards/format_reward": 0.2083333358168602, "step": 189 }, { "completion_length": 3173.916702270508, "epoch": 0.0816545796400752, "grad_norm": 0.14527945220470428, "kl": 0.023956298828125, "learning_rate": 8.01636806561836e-07, "loss": 0.0211, "reward": 0.14928979286924005, "reward_std": 0.6108090840280056, "rewards/cosine_scaled_reward": -0.0920217726379633, "rewards/format_reward": 0.3333333469927311, "step": 190 }, { "completion_length": 3545.312530517578, "epoch": 0.08208434058554928, "grad_norm": 0.10277193784713745, "kl": 0.0347137451171875, "learning_rate": 7.990261971595048e-07, "loss": 0.0068, "reward": -0.3592158183455467, "reward_std": 0.4098637104034424, "rewards/cosine_scaled_reward": -0.2108579110354185, "rewards/format_reward": 0.06250000186264515, "step": 191 }, { "completion_length": 3386.5, "epoch": 0.08251410153102337, "grad_norm": 0.09498348832130432, "kl": 0.03261566162109375, "learning_rate": 7.964034505716476e-07, "loss": 0.0115, "reward": 0.18360814824700356, "reward_std": 0.467612624168396, "rewards/cosine_scaled_reward": -0.033195920288562775, "rewards/format_reward": 0.25000000558793545, "step": 192 }, { "completion_length": 3572.2291870117188, "epoch": 0.08294386247649745, "grad_norm": 0.16400958597660065, "kl": 0.02462005615234375, "learning_rate": 7.93768694627233e-07, "loss": 0.01, "reward": -0.23704488668590784, "reward_std": 0.8231727816164494, "rewards/cosine_scaled_reward": -0.18102245219051838, "rewards/format_reward": 0.1250000037252903, "step": 193 }, { "completion_length": 3437.479217529297, "epoch": 0.08337362342197153, "grad_norm": 0.1961127370595932, "kl": 0.03478240966796875, "learning_rate": 7.911220577405484e-07, "loss": 0.0444, "reward": -0.1177384490147233, "reward_std": 0.8588898852467537, "rewards/cosine_scaled_reward": -0.15261922194622457, "rewards/format_reward": 0.18750000558793545, "step": 194 }, { "completion_length": 3180.2708587646484, "epoch": 0.08380338436744561, "grad_norm": 0.13432663679122925, "kl": 0.029193878173828125, "learning_rate": 7.884636689049422e-07, "loss": 0.0311, "reward": -0.008240756345912814, "reward_std": 0.62161635607481, "rewards/cosine_scaled_reward": -0.18120371364057064, "rewards/format_reward": 0.3541666716337204, "step": 195 }, { "completion_length": 3555.312530517578, "epoch": 0.08423314531291969, "grad_norm": 0.14075148105621338, "kl": 0.0388946533203125, "learning_rate": 7.857936576865356e-07, "loss": 0.0136, "reward": -0.2789467005059123, "reward_std": 0.6226627379655838, "rewards/cosine_scaled_reward": -0.18114001862704754, "rewards/format_reward": 0.0833333358168602, "step": 196 }, { "completion_length": 3444.2708740234375, "epoch": 0.08466290625839377, "grad_norm": 0.13743709027767181, "kl": 0.0413360595703125, "learning_rate": 7.831121542179086e-07, "loss": 0.0174, "reward": -0.20011738629546016, "reward_std": 0.5709670521318913, "rewards/cosine_scaled_reward": -0.1938086934387684, "rewards/format_reward": 0.18750000558793545, "step": 197 }, { "completion_length": 3181.0833587646484, "epoch": 0.08509266720386785, "grad_norm": 0.1622048020362854, "kl": 0.0224761962890625, "learning_rate": 7.804192891917571e-07, "loss": 0.0201, "reward": 0.13934148475527763, "reward_std": 0.5416624695062637, "rewards/cosine_scaled_reward": -0.07616259157657623, "rewards/format_reward": 0.2916666716337204, "step": 198 }, { "completion_length": 3388.4166870117188, "epoch": 0.08552242814934193, "grad_norm": 0.1238652840256691, "kl": 0.030548095703125, "learning_rate": 7.777151938545235e-07, "loss": 0.0276, "reward": -0.2362179271876812, "reward_std": 0.6051121056079865, "rewards/cosine_scaled_reward": -0.23269230965524912, "rewards/format_reward": 0.2291666716337204, "step": 199 }, { "completion_length": 3429.812530517578, "epoch": 0.08595218909481601, "grad_norm": 0.09556426852941513, "kl": 0.034912109375, "learning_rate": 7.75e-07, "loss": 0.0204, "reward": -0.33206217363476753, "reward_std": 0.40821436420083046, "rewards/cosine_scaled_reward": -0.22853108774870634, "rewards/format_reward": 0.12500000558793545, "step": 200 }, { "completion_length": 3193.6458740234375, "epoch": 0.08638195004029009, "grad_norm": 0.17243292927742004, "kl": 0.0253143310546875, "learning_rate": 7.72273839962904e-07, "loss": 0.0582, "reward": 0.2539176023565233, "reward_std": 0.7869996652007103, "rewards/cosine_scaled_reward": -0.05012453440576792, "rewards/format_reward": 0.35416667722165585, "step": 201 }, { "completion_length": 3550.7708435058594, "epoch": 0.08681171098576417, "grad_norm": 0.13687381148338318, "kl": 0.03179168701171875, "learning_rate": 7.695368466124296e-07, "loss": 0.0017, "reward": -0.3242543153464794, "reward_std": 0.5061924643814564, "rewards/cosine_scaled_reward": -0.23504382057581097, "rewards/format_reward": 0.1458333395421505, "step": 202 }, { "completion_length": 3194.7083740234375, "epoch": 0.08724147193123825, "grad_norm": 0.12136410176753998, "kl": 0.039093017578125, "learning_rate": 7.667891533457718e-07, "loss": 0.0159, "reward": 0.29038818180561066, "reward_std": 0.5406546741724014, "rewards/cosine_scaled_reward": -0.011055925861001015, "rewards/format_reward": 0.3125000074505806, "step": 203 }, { "completion_length": 3402.0208435058594, "epoch": 0.08767123287671233, "grad_norm": 0.11410660296678543, "kl": 0.0285186767578125, "learning_rate": 7.640308940816239e-07, "loss": 0.0309, "reward": -0.2242907714098692, "reward_std": 0.5899740010499954, "rewards/cosine_scaled_reward": -0.1850620498880744, "rewards/format_reward": 0.14583333767950535, "step": 204 }, { "completion_length": 3179.812545776367, "epoch": 0.0881009938221864, "grad_norm": 0.18997453153133392, "kl": 0.04427337646484375, "learning_rate": 7.612622032536507e-07, "loss": 0.0314, "reward": 0.05087480694055557, "reward_std": 0.48003071919083595, "rewards/cosine_scaled_reward": -0.12039593700319529, "rewards/format_reward": 0.2916666679084301, "step": 205 }, { "completion_length": 3207.0208740234375, "epoch": 0.0885307547676605, "grad_norm": 0.407353013753891, "kl": 0.03714752197265625, "learning_rate": 7.584832158039378e-07, "loss": 0.0519, "reward": 0.3597016856074333, "reward_std": 0.7916782014071941, "rewards/cosine_scaled_reward": -0.02848249551607296, "rewards/format_reward": 0.416666679084301, "step": 206 }, { "completion_length": 3297.7709045410156, "epoch": 0.08896051571313457, "grad_norm": 0.2145596593618393, "kl": 0.046661376953125, "learning_rate": 7.556940671764124e-07, "loss": 0.0572, "reward": 0.3155039772391319, "reward_std": 0.7965943478047848, "rewards/cosine_scaled_reward": 0.0015019848942756653, "rewards/format_reward": 0.31250000931322575, "step": 207 }, { "completion_length": 3048.9583587646484, "epoch": 0.08939027665860864, "grad_norm": 0.13097485899925232, "kl": 0.03200531005859375, "learning_rate": 7.528948933102438e-07, "loss": 0.033, "reward": 0.6474989429116249, "reward_std": 0.708543311804533, "rewards/cosine_scaled_reward": 0.10499945655465126, "rewards/format_reward": 0.43750000186264515, "step": 208 }, { "completion_length": 3385.9166870117188, "epoch": 0.08982003760408273, "grad_norm": 0.15453311800956726, "kl": 0.0509185791015625, "learning_rate": 7.500858306332172e-07, "loss": 0.0285, "reward": -0.0658284134697169, "reward_std": 0.6442733351141214, "rewards/cosine_scaled_reward": -0.12666420824825764, "rewards/format_reward": 0.1875000074505806, "step": 209 }, { "completion_length": 3579.5208435058594, "epoch": 0.09024979854955681, "grad_norm": 0.15672118961811066, "kl": 0.0450897216796875, "learning_rate": 7.472670160550848e-07, "loss": 0.0046, "reward": -0.3182646604254842, "reward_std": 0.7156099434942007, "rewards/cosine_scaled_reward": -0.19038232951425016, "rewards/format_reward": 0.06250000186264515, "step": 210 }, { "completion_length": 2986.4166717529297, "epoch": 0.09067955949503088, "grad_norm": 0.16068172454833984, "kl": 0.03653717041015625, "learning_rate": 7.444385869608921e-07, "loss": 0.0312, "reward": 0.007152508944272995, "reward_std": 0.6881944872438908, "rewards/cosine_scaled_reward": -0.1630904166959226, "rewards/format_reward": 0.3333333395421505, "step": 211 }, { "completion_length": 2977.2708587646484, "epoch": 0.09110932044050497, "grad_norm": 0.19748130440711975, "kl": 0.047576904296875, "learning_rate": 7.416006812042827e-07, "loss": 0.0352, "reward": 0.2155323214828968, "reward_std": 0.6074364297091961, "rewards/cosine_scaled_reward": -0.1005671825259924, "rewards/format_reward": 0.41666667722165585, "step": 212 }, { "completion_length": 3140.437530517578, "epoch": 0.09153908138597905, "grad_norm": 0.19864711165428162, "kl": 0.0439605712890625, "learning_rate": 7.387534371007797e-07, "loss": 0.0512, "reward": 0.09519186615943909, "reward_std": 0.6753976568579674, "rewards/cosine_scaled_reward": -0.12948740273714066, "rewards/format_reward": 0.35416667722165585, "step": 213 }, { "completion_length": 3523.1458740234375, "epoch": 0.09196884233145312, "grad_norm": 0.1615707278251648, "kl": 0.0407257080078125, "learning_rate": 7.358969934210438e-07, "loss": 0.0165, "reward": -0.24203333631157875, "reward_std": 0.6806427016854286, "rewards/cosine_scaled_reward": -0.22518333233892918, "rewards/format_reward": 0.20833333767950535, "step": 214 }, { "completion_length": 3129.4583740234375, "epoch": 0.09239860327692721, "grad_norm": 0.25122833251953125, "kl": 0.0523834228515625, "learning_rate": 7.330314893841101e-07, "loss": 0.0522, "reward": 0.2533303890377283, "reward_std": 0.6469140015542507, "rewards/cosine_scaled_reward": -0.019168131984770298, "rewards/format_reward": 0.2916666716337204, "step": 215 }, { "completion_length": 3257.7708435058594, "epoch": 0.09282836422240129, "grad_norm": 0.0941159650683403, "kl": 0.04681396484375, "learning_rate": 7.301570646506027e-07, "loss": 0.0095, "reward": -0.3436866719275713, "reward_std": 0.3506512399762869, "rewards/cosine_scaled_reward": -0.24476001039147377, "rewards/format_reward": 0.14583333395421505, "step": 216 }, { "completion_length": 3445.1458740234375, "epoch": 0.09325812516787536, "grad_norm": 0.25555482506752014, "kl": 0.058013916015625, "learning_rate": 7.27273859315928e-07, "loss": 0.0541, "reward": 0.03174871392548084, "reward_std": 0.9210008624941111, "rewards/cosine_scaled_reward": -0.07787565630860627, "rewards/format_reward": 0.18750000186264515, "step": 217 }, { "completion_length": 3249.3125610351562, "epoch": 0.09368788611334945, "grad_norm": 0.20539967715740204, "kl": 0.0483856201171875, "learning_rate": 7.243820139034464e-07, "loss": 0.0299, "reward": 0.11185483355075121, "reward_std": 0.7668456397950649, "rewards/cosine_scaled_reward": -0.07948926091194153, "rewards/format_reward": 0.27083333395421505, "step": 218 }, { "completion_length": 3401.4583740234375, "epoch": 0.09411764705882353, "grad_norm": 0.1412569135427475, "kl": 0.049468994140625, "learning_rate": 7.214816693576234e-07, "loss": 0.0011, "reward": -0.23505458794534206, "reward_std": 0.627603467553854, "rewards/cosine_scaled_reward": -0.20086063025519252, "rewards/format_reward": 0.1666666716337204, "step": 219 }, { "completion_length": 3502.687530517578, "epoch": 0.0945474080042976, "grad_norm": 0.1777493804693222, "kl": 0.066680908203125, "learning_rate": 7.185729670371604e-07, "loss": 0.0189, "reward": -0.19496820122003555, "reward_std": 0.7111810259521008, "rewards/cosine_scaled_reward": -0.17040077410638332, "rewards/format_reward": 0.14583333395421505, "step": 220 }, { "completion_length": 3412.5416870117188, "epoch": 0.09497716894977169, "grad_norm": 0.1054745763540268, "kl": 0.057647705078125, "learning_rate": 7.156560487081051e-07, "loss": 0.0061, "reward": -0.2897605150938034, "reward_std": 0.4697086103260517, "rewards/cosine_scaled_reward": -0.24904693104326725, "rewards/format_reward": 0.20833333395421505, "step": 221 }, { "completion_length": 3412.7083435058594, "epoch": 0.09540692989524577, "grad_norm": 0.18695291876792908, "kl": 0.0538787841796875, "learning_rate": 7.127310565369415e-07, "loss": 0.06, "reward": -0.3170203072950244, "reward_std": 0.6680502519011497, "rewards/cosine_scaled_reward": -0.1897601610980928, "rewards/format_reward": 0.06250000186264515, "step": 222 }, { "completion_length": 3207.8125610351562, "epoch": 0.09583669084071984, "grad_norm": 0.5077880024909973, "kl": 0.058624267578125, "learning_rate": 7.097981330836616e-07, "loss": 0.1191, "reward": 0.059028610587120056, "reward_std": 0.8817252814769745, "rewards/cosine_scaled_reward": -0.09548570215702057, "rewards/format_reward": 0.25000000558793545, "step": 223 }, { "completion_length": 3346.5, "epoch": 0.09626645178619393, "grad_norm": 0.3529908359050751, "kl": 0.0791015625, "learning_rate": 7.068574212948169e-07, "loss": 0.0662, "reward": -0.16201750189065933, "reward_std": 0.6817200817167759, "rewards/cosine_scaled_reward": -0.19559208862483501, "rewards/format_reward": 0.2291666716337204, "step": 224 }, { "completion_length": 3531.1875, "epoch": 0.09669621273166801, "grad_norm": 0.11196991056203842, "kl": 0.0760345458984375, "learning_rate": 7.039090644965509e-07, "loss": 0.0223, "reward": -0.3526093331165612, "reward_std": 0.40742000564932823, "rewards/cosine_scaled_reward": -0.19713799748569727, "rewards/format_reward": 0.0416666679084301, "step": 225 }, { "completion_length": 3225.229217529297, "epoch": 0.09712597367714208, "grad_norm": 0.165104478597641, "kl": 0.067474365234375, "learning_rate": 7.009532063876148e-07, "loss": 0.027, "reward": -0.1772719812579453, "reward_std": 0.6228863187134266, "rewards/cosine_scaled_reward": -0.18238599598407745, "rewards/format_reward": 0.18750000186264515, "step": 226 }, { "completion_length": 2982.6458892822266, "epoch": 0.09755573462261617, "grad_norm": 0.2126280665397644, "kl": 0.08245849609375, "learning_rate": 6.979899910323624e-07, "loss": 0.0327, "reward": 0.09923556400462985, "reward_std": 0.5942269749939442, "rewards/cosine_scaled_reward": -0.11704888939857483, "rewards/format_reward": 0.3333333395421505, "step": 227 }, { "completion_length": 2873.6875762939453, "epoch": 0.09798549556809025, "grad_norm": 0.30571866035461426, "kl": 0.0928955078125, "learning_rate": 6.950195628537299e-07, "loss": 0.0545, "reward": 0.2864613034762442, "reward_std": 0.820480864495039, "rewards/cosine_scaled_reward": -0.075519357342273, "rewards/format_reward": 0.43750000931322575, "step": 228 }, { "completion_length": 3080.8334045410156, "epoch": 0.09841525651356434, "grad_norm": 0.17195257544517517, "kl": 0.0755615234375, "learning_rate": 6.920420666261961e-07, "loss": 0.0329, "reward": -0.09409035369753838, "reward_std": 0.607878141105175, "rewards/cosine_scaled_reward": -0.18246185244061053, "rewards/format_reward": 0.2708333395421505, "step": 229 }, { "completion_length": 2944.4375228881836, "epoch": 0.09884501745903841, "grad_norm": 0.14921163022518158, "kl": 0.10565185546875, "learning_rate": 6.890576474687263e-07, "loss": 0.0077, "reward": -0.20375279360450804, "reward_std": 0.43036815896630287, "rewards/cosine_scaled_reward": -0.26854306645691395, "rewards/format_reward": 0.3333333358168602, "step": 230 }, { "completion_length": 2954.0833435058594, "epoch": 0.09927477840451249, "grad_norm": 0.2767171561717987, "kl": 0.15283203125, "learning_rate": 6.860664508377001e-07, "loss": 0.0295, "reward": -0.15808595577254891, "reward_std": 0.49743566662073135, "rewards/cosine_scaled_reward": -0.1936263097450137, "rewards/format_reward": 0.2291666679084301, "step": 231 }, { "completion_length": 3401.416717529297, "epoch": 0.09970453934998658, "grad_norm": 0.15008047223091125, "kl": 0.1064453125, "learning_rate": 6.83068622519821e-07, "loss": 0.021, "reward": -0.31060757860541344, "reward_std": 0.5292501132935286, "rewards/cosine_scaled_reward": -0.2282204576767981, "rewards/format_reward": 0.1458333395421505, "step": 232 }, { "completion_length": 3470.125030517578, "epoch": 0.10013430029546065, "grad_norm": 0.2797686755657196, "kl": 0.1021728515625, "learning_rate": 6.800643086250121e-07, "loss": 0.0488, "reward": -0.29418041615281254, "reward_std": 0.5648275827988982, "rewards/cosine_scaled_reward": -0.17834021849557757, "rewards/format_reward": 0.06250000186264515, "step": 233 }, { "completion_length": 3311.000030517578, "epoch": 0.10056406124093473, "grad_norm": 0.16725048422813416, "kl": 0.098541259765625, "learning_rate": 6.770536555792944e-07, "loss": 0.0148, "reward": 0.08625454641878605, "reward_std": 0.6813838183879852, "rewards/cosine_scaled_reward": -0.12353941053152084, "rewards/format_reward": 0.33333334140479565, "step": 234 }, { "completion_length": 3378.0833740234375, "epoch": 0.10099382218640882, "grad_norm": 0.16376158595085144, "kl": 0.127685546875, "learning_rate": 6.740368101176495e-07, "loss": 0.025, "reward": -0.12530528474599123, "reward_std": 0.4470914788544178, "rewards/cosine_scaled_reward": -0.21890264190733433, "rewards/format_reward": 0.3125000111758709, "step": 235 }, { "completion_length": 3294.0416870117188, "epoch": 0.10142358313188289, "grad_norm": 0.18415580689907074, "kl": 0.121673583984375, "learning_rate": 6.710139192768694e-07, "loss": 0.0216, "reward": 0.09182750558829866, "reward_std": 0.6466180570423603, "rewards/cosine_scaled_reward": -0.08950291387736797, "rewards/format_reward": 0.2708333358168602, "step": 236 }, { "completion_length": 3185.8750610351562, "epoch": 0.10185334407735697, "grad_norm": 0.3965671956539154, "kl": 0.100250244140625, "learning_rate": 6.679851303883891e-07, "loss": 0.0413, "reward": 0.35130938421934843, "reward_std": 0.7578475214540958, "rewards/cosine_scaled_reward": 0.019404693506658077, "rewards/format_reward": 0.31250000186264515, "step": 237 }, { "completion_length": 3130.312530517578, "epoch": 0.10228310502283106, "grad_norm": 0.2065299153327942, "kl": 0.133148193359375, "learning_rate": 6.649505910711058e-07, "loss": 0.0218, "reward": -0.24415791034698486, "reward_std": 0.5432456694543362, "rewards/cosine_scaled_reward": -0.23666228912770748, "rewards/format_reward": 0.22916666977107525, "step": 238 }, { "completion_length": 3228.291717529297, "epoch": 0.10271286596830513, "grad_norm": 0.33605965971946716, "kl": 0.146240234375, "learning_rate": 6.619104492241847e-07, "loss": 0.0485, "reward": -0.2976772617548704, "reward_std": 0.5471369195729494, "rewards/cosine_scaled_reward": -0.24258863677096087, "rewards/format_reward": 0.18750000186264515, "step": 239 }, { "completion_length": 2860.291732788086, "epoch": 0.1031426269137792, "grad_norm": 0.384964257478714, "kl": 0.142333984375, "learning_rate": 6.588648530198504e-07, "loss": 0.0695, "reward": -0.0938523430377245, "reward_std": 0.7088601291179657, "rewards/cosine_scaled_reward": -0.1198428338393569, "rewards/format_reward": 0.14583333767950535, "step": 240 }, { "completion_length": 3273.0625610351562, "epoch": 0.1035723878592533, "grad_norm": 0.2744850516319275, "kl": 0.1512451171875, "learning_rate": 6.558139508961654e-07, "loss": 0.0388, "reward": 0.22037378139793873, "reward_std": 0.6518696863204241, "rewards/cosine_scaled_reward": -0.035646433010697365, "rewards/format_reward": 0.29166666977107525, "step": 241 }, { "completion_length": 3478.500030517578, "epoch": 0.10400214880472737, "grad_norm": 0.2901211678981781, "kl": 0.19769287109375, "learning_rate": 6.527578915497951e-07, "loss": 0.0278, "reward": -0.29410413303412497, "reward_std": 0.6693531088531017, "rewards/cosine_scaled_reward": -0.21996873430907726, "rewards/format_reward": 0.1458333358168602, "step": 242 }, { "completion_length": 3275.5209045410156, "epoch": 0.10443190975020145, "grad_norm": 0.19410951435565948, "kl": 0.160980224609375, "learning_rate": 6.496968239287603e-07, "loss": 0.0324, "reward": -0.22141533065587282, "reward_std": 0.501360110938549, "rewards/cosine_scaled_reward": -0.1836243411526084, "rewards/format_reward": 0.14583333767950535, "step": 243 }, { "completion_length": 3252.6041870117188, "epoch": 0.10486167069567554, "grad_norm": 0.30771979689598083, "kl": 0.1920166015625, "learning_rate": 6.466308972251785e-07, "loss": 0.0199, "reward": 0.34057202469557524, "reward_std": 0.6942340489476919, "rewards/cosine_scaled_reward": 0.014035999774932861, "rewards/format_reward": 0.31250000186264515, "step": 244 }, { "completion_length": 2905.5625762939453, "epoch": 0.10529143164114961, "grad_norm": 0.34957805275917053, "kl": 0.17431640625, "learning_rate": 6.435602608679916e-07, "loss": 0.0424, "reward": 0.20427564159035683, "reward_std": 0.8110857531428337, "rewards/cosine_scaled_reward": -0.12702885386534035, "rewards/format_reward": 0.45833334140479565, "step": 245 }, { "completion_length": 3238.8334350585938, "epoch": 0.10572119258662369, "grad_norm": 0.4242176115512848, "kl": 0.1932373046875, "learning_rate": 6.404850645156841e-07, "loss": 0.0555, "reward": -0.045491838827729225, "reward_std": 0.7336732260882854, "rewards/cosine_scaled_reward": -0.15816259011626244, "rewards/format_reward": 0.2708333395421505, "step": 246 }, { "completion_length": 3354.2291870117188, "epoch": 0.10615095353209777, "grad_norm": 0.5245217680931091, "kl": 0.199432373046875, "learning_rate": 6.374054580489873e-07, "loss": 0.0571, "reward": 0.10067729279398918, "reward_std": 0.6560806222259998, "rewards/cosine_scaled_reward": -0.05382802244275808, "rewards/format_reward": 0.20833334140479565, "step": 247 }, { "completion_length": 3355.104217529297, "epoch": 0.10658071447757185, "grad_norm": 0.3123754858970642, "kl": 0.255126953125, "learning_rate": 6.343215915635761e-07, "loss": 0.0399, "reward": -0.2293941890820861, "reward_std": 0.5414997935295105, "rewards/cosine_scaled_reward": -0.21886378154158592, "rewards/format_reward": 0.2083333358168602, "step": 248 }, { "completion_length": 3105.9376068115234, "epoch": 0.10701047542304593, "grad_norm": 0.5774914622306824, "kl": 0.240234375, "learning_rate": 6.31233615362752e-07, "loss": 0.0585, "reward": -0.05396188795566559, "reward_std": 0.8359964787960052, "rewards/cosine_scaled_reward": -0.18323094956576824, "rewards/format_reward": 0.31250000558793545, "step": 249 }, { "completion_length": 3342.3541870117188, "epoch": 0.10744023636852001, "grad_norm": 0.3618089258670807, "kl": 0.30517578125, "learning_rate": 6.281416799501187e-07, "loss": 0.0203, "reward": -0.18617781065404415, "reward_std": 0.4795129355043173, "rewards/cosine_scaled_reward": -0.1868389081209898, "rewards/format_reward": 0.18750000186264515, "step": 250 }, { "completion_length": 3042.875045776367, "epoch": 0.10786999731399409, "grad_norm": 0.37689244747161865, "kl": 0.289794921875, "learning_rate": 6.25045936022246e-07, "loss": 0.0324, "reward": 0.16125685814768076, "reward_std": 0.6665598265826702, "rewards/cosine_scaled_reward": -0.13812158396467566, "rewards/format_reward": 0.4375000037252903, "step": 251 }, { "completion_length": 3482.4166870117188, "epoch": 0.10829975825946817, "grad_norm": 0.420097291469574, "kl": 0.3709716796875, "learning_rate": 6.219465344613258e-07, "loss": 0.0529, "reward": -0.16390705294907093, "reward_std": 0.675162598490715, "rewards/cosine_scaled_reward": -0.1652868576347828, "rewards/format_reward": 0.16666666977107525, "step": 252 }, { "completion_length": 3059.0833892822266, "epoch": 0.10872951920494225, "grad_norm": 0.40207359194755554, "kl": 0.29473876953125, "learning_rate": 6.188436263278172e-07, "loss": 0.0414, "reward": 0.2715332768857479, "reward_std": 0.9526352770626545, "rewards/cosine_scaled_reward": -0.06215003225952387, "rewards/format_reward": 0.3958333395421505, "step": 253 }, { "completion_length": 3305.750030517578, "epoch": 0.10915928015041633, "grad_norm": 0.4389883577823639, "kl": 0.30810546875, "learning_rate": 6.157373628530852e-07, "loss": 0.0202, "reward": -0.1759735383093357, "reward_std": 0.5703559443354607, "rewards/cosine_scaled_reward": -0.17132011079229414, "rewards/format_reward": 0.1666666679084301, "step": 254 }, { "completion_length": 3313.604217529297, "epoch": 0.1095890410958904, "grad_norm": 0.46990692615509033, "kl": 0.34814453125, "learning_rate": 6.126278954320294e-07, "loss": 0.0705, "reward": -0.15743695013225079, "reward_std": 0.6556378323584795, "rewards/cosine_scaled_reward": -0.17246847320348024, "rewards/format_reward": 0.1875000074505806, "step": 255 }, { "completion_length": 3240.312530517578, "epoch": 0.1100188020413645, "grad_norm": 0.5203275084495544, "kl": 0.3746337890625, "learning_rate": 6.095153756157051e-07, "loss": 0.0213, "reward": -0.19026320055127144, "reward_std": 0.6170386075973511, "rewards/cosine_scaled_reward": -0.20971493795514107, "rewards/format_reward": 0.2291666679084301, "step": 256 }, { "completion_length": 3506.2083435058594, "epoch": 0.11044856298683857, "grad_norm": 0.4531089663505554, "kl": 0.3936767578125, "learning_rate": 6.06399955103937e-07, "loss": 0.0364, "reward": -0.2065655398182571, "reward_std": 0.6821424793452024, "rewards/cosine_scaled_reward": -0.17619943688623607, "rewards/format_reward": 0.1458333358168602, "step": 257 }, { "completion_length": 3384.166748046875, "epoch": 0.11087832393231264, "grad_norm": 0.535594642162323, "kl": 0.34912109375, "learning_rate": 6.032817857379256e-07, "loss": 0.0645, "reward": 0.20002377592027187, "reward_std": 0.8622936233878136, "rewards/cosine_scaled_reward": -0.06665478553622961, "rewards/format_reward": 0.3333333432674408, "step": 258 }, { "completion_length": 3224.7291870117188, "epoch": 0.11130808487778673, "grad_norm": 0.7016111016273499, "kl": 0.3460693359375, "learning_rate": 6.001610194928464e-07, "loss": 0.0947, "reward": 0.0013799052685499191, "reward_std": 0.7243241295218468, "rewards/cosine_scaled_reward": -0.13472672598436475, "rewards/format_reward": 0.27083334140479565, "step": 259 }, { "completion_length": 3215.916717529297, "epoch": 0.11173784582326081, "grad_norm": 0.37652695178985596, "kl": 0.3382568359375, "learning_rate": 5.97037808470444e-07, "loss": 0.0334, "reward": -0.21050314977765083, "reward_std": 0.5167079046368599, "rewards/cosine_scaled_reward": -0.24066824652254581, "rewards/format_reward": 0.27083333767950535, "step": 260 }, { "completion_length": 2992.3750610351562, "epoch": 0.11216760676873488, "grad_norm": 0.4304850101470947, "kl": 0.323974609375, "learning_rate": 5.939123048916173e-07, "loss": 0.0466, "reward": 0.027021964080631733, "reward_std": 0.6674534231424332, "rewards/cosine_scaled_reward": -0.13232235587202013, "rewards/format_reward": 0.2916666716337204, "step": 261 }, { "completion_length": 3310.0625610351562, "epoch": 0.11259736771420897, "grad_norm": 0.2637018859386444, "kl": 0.3104248046875, "learning_rate": 5.907846610890011e-07, "loss": 0.0376, "reward": -0.1610091393813491, "reward_std": 0.531306691467762, "rewards/cosine_scaled_reward": -0.23675457946956158, "rewards/format_reward": 0.31250001303851604, "step": 262 }, { "completion_length": 3168.5416870117188, "epoch": 0.11302712865968305, "grad_norm": 0.28266096115112305, "kl": 0.305419921875, "learning_rate": 5.87655029499542e-07, "loss": 0.0322, "reward": -0.0013561546802520752, "reward_std": 0.5143809728324413, "rewards/cosine_scaled_reward": -0.15692808292806149, "rewards/format_reward": 0.3125000074505806, "step": 263 }, { "completion_length": 2851.0209197998047, "epoch": 0.11345688960515714, "grad_norm": 0.308773398399353, "kl": 0.30206298828125, "learning_rate": 5.845235626570683e-07, "loss": 0.0414, "reward": 0.2397294081747532, "reward_std": 0.7897104993462563, "rewards/cosine_scaled_reward": -0.0988853108137846, "rewards/format_reward": 0.4375000111758709, "step": 264 }, { "completion_length": 3474.416717529297, "epoch": 0.11388665055063121, "grad_norm": 0.6649038791656494, "kl": 0.387939453125, "learning_rate": 5.813904131848564e-07, "loss": 0.0174, "reward": -0.07892851100768894, "reward_std": 0.436942171305418, "rewards/cosine_scaled_reward": -0.15404759347438812, "rewards/format_reward": 0.2291666679084301, "step": 265 }, { "completion_length": 3210.3958740234375, "epoch": 0.11431641149610529, "grad_norm": 0.3406374454498291, "kl": 0.3203125, "learning_rate": 5.78255733788191e-07, "loss": 0.042, "reward": -0.17140778026077896, "reward_std": 0.6471902802586555, "rewards/cosine_scaled_reward": -0.17945388611406088, "rewards/format_reward": 0.1875000074505806, "step": 266 }, { "completion_length": 2659.062515258789, "epoch": 0.11474617244157938, "grad_norm": 0.2383652925491333, "kl": 0.26202392578125, "learning_rate": 5.751196772469237e-07, "loss": 0.0205, "reward": 0.12675385177135468, "reward_std": 0.6533542461693287, "rewards/cosine_scaled_reward": -0.14495642157271504, "rewards/format_reward": 0.41666667349636555, "step": 267 }, { "completion_length": 3044.375030517578, "epoch": 0.11517593338705345, "grad_norm": 0.9735339879989624, "kl": 0.29833984375, "learning_rate": 5.71982396408026e-07, "loss": 0.0711, "reward": 0.5068819033913314, "reward_std": 1.0000681914389133, "rewards/cosine_scaled_reward": 0.003440950531512499, "rewards/format_reward": 0.5000000149011612, "step": 268 }, { "completion_length": 3136.041748046875, "epoch": 0.11560569433252753, "grad_norm": 1.0504587888717651, "kl": 0.2852783203125, "learning_rate": 5.688440441781398e-07, "loss": 0.0757, "reward": 0.22405152022838593, "reward_std": 0.8469377644360065, "rewards/cosine_scaled_reward": -0.11714090965688229, "rewards/format_reward": 0.45833334885537624, "step": 269 }, { "completion_length": 3260.0416870117188, "epoch": 0.11603545527800162, "grad_norm": 0.2590119242668152, "kl": 0.37548828125, "learning_rate": 5.657047735161255e-07, "loss": 0.0441, "reward": -0.3106625219807029, "reward_std": 0.41818269342184067, "rewards/cosine_scaled_reward": -0.24908126704394817, "rewards/format_reward": 0.18750000558793545, "step": 270 }, { "completion_length": 2952.4583892822266, "epoch": 0.11646521622347569, "grad_norm": 0.39680132269859314, "kl": 0.32830810546875, "learning_rate": 5.625647374256061e-07, "loss": 0.0461, "reward": 0.3552531637251377, "reward_std": 0.5729935504496098, "rewards/cosine_scaled_reward": -0.04112342558801174, "rewards/format_reward": 0.43750000558793545, "step": 271 }, { "completion_length": 2728.604202270508, "epoch": 0.11689497716894977, "grad_norm": 0.5533512234687805, "kl": 0.373382568359375, "learning_rate": 5.594240889475106e-07, "loss": 0.0621, "reward": 0.3282827250659466, "reward_std": 0.7863426059484482, "rewards/cosine_scaled_reward": -0.03377529792487621, "rewards/format_reward": 0.39583333767950535, "step": 272 }, { "completion_length": 3247.2083435058594, "epoch": 0.11732473811442386, "grad_norm": 0.4072413742542267, "kl": 0.411376953125, "learning_rate": 5.562829811526154e-07, "loss": 0.0501, "reward": 0.007193543016910553, "reward_std": 0.6402526348829269, "rewards/cosine_scaled_reward": -0.16306989826261997, "rewards/format_reward": 0.33333334513008595, "step": 273 }, { "completion_length": 2827.9375610351562, "epoch": 0.11775449905989793, "grad_norm": 0.6651548147201538, "kl": 0.3885498046875, "learning_rate": 5.531415671340826e-07, "loss": 0.0536, "reward": 0.5889218859374523, "reward_std": 0.798742763698101, "rewards/cosine_scaled_reward": -0.007622419856488705, "rewards/format_reward": 0.6041666865348816, "step": 274 }, { "completion_length": 3124.937530517578, "epoch": 0.11818426000537201, "grad_norm": 0.7633119225502014, "kl": 0.49609375, "learning_rate": 5.5e-07, "loss": 0.0809, "reward": 0.07992564514279366, "reward_std": 0.7367474138736725, "rewards/cosine_scaled_reward": -0.10587052209302783, "rewards/format_reward": 0.29166667349636555, "step": 275 }, { "completion_length": 3260.229217529297, "epoch": 0.1186140209508461, "grad_norm": 1.2255624532699585, "kl": 0.52685546875, "learning_rate": 5.468584328659172e-07, "loss": 0.0976, "reward": 0.09537929436191916, "reward_std": 0.813844308257103, "rewards/cosine_scaled_reward": -0.10856035631150007, "rewards/format_reward": 0.3125000074505806, "step": 276 }, { "completion_length": 3218.791717529297, "epoch": 0.11904378189632017, "grad_norm": 1.194951057434082, "kl": 0.533447265625, "learning_rate": 5.437170188473847e-07, "loss": 0.092, "reward": 0.3788673263043165, "reward_std": 1.0014424696564674, "rewards/cosine_scaled_reward": -0.00848300801590085, "rewards/format_reward": 0.39583334513008595, "step": 277 }, { "completion_length": 2808.625045776367, "epoch": 0.11947354284179425, "grad_norm": 0.7314147353172302, "kl": 0.6029052734375, "learning_rate": 5.405759110524894e-07, "loss": 0.093, "reward": 0.442706611007452, "reward_std": 0.7734650187194347, "rewards/cosine_scaled_reward": -0.01823002379387617, "rewards/format_reward": 0.47916667349636555, "step": 278 }, { "completion_length": 2802.5000762939453, "epoch": 0.11990330378726834, "grad_norm": 0.5121820569038391, "kl": 0.6251220703125, "learning_rate": 5.37435262574394e-07, "loss": 0.0719, "reward": 0.1671016328036785, "reward_std": 0.6097588539123535, "rewards/cosine_scaled_reward": -0.14561585523188114, "rewards/format_reward": 0.4583333432674408, "step": 279 }, { "completion_length": 2956.354217529297, "epoch": 0.12033306473274241, "grad_norm": 1.6248406171798706, "kl": 0.6669921875, "learning_rate": 5.342952264838747e-07, "loss": 0.0109, "reward": -0.1070807955111377, "reward_std": 0.620513990521431, "rewards/cosine_scaled_reward": -0.25145706813782454, "rewards/format_reward": 0.39583334140479565, "step": 280 }, { "completion_length": 2680.875045776367, "epoch": 0.12076282567821649, "grad_norm": 0.7339856028556824, "kl": 0.64013671875, "learning_rate": 5.311559558218603e-07, "loss": 0.0724, "reward": 0.3123179245740175, "reward_std": 0.6551225483417511, "rewards/cosine_scaled_reward": -0.10425769072026014, "rewards/format_reward": 0.520833345130086, "step": 281 }, { "completion_length": 3147.541717529297, "epoch": 0.12119258662369058, "grad_norm": 0.6487550139427185, "kl": 0.7685546875, "learning_rate": 5.28017603591974e-07, "loss": 0.0611, "reward": 0.16217099921777844, "reward_std": 0.729453157633543, "rewards/cosine_scaled_reward": -0.18974783970043063, "rewards/format_reward": 0.5416666753590107, "step": 282 }, { "completion_length": 3113.1250610351562, "epoch": 0.12162234756916465, "grad_norm": 0.6284691691398621, "kl": 0.68798828125, "learning_rate": 5.248803227530763e-07, "loss": 0.0684, "reward": -0.1357586346566677, "reward_std": 0.5350559465587139, "rewards/cosine_scaled_reward": -0.224129319190979, "rewards/format_reward": 0.31250000558793545, "step": 283 }, { "completion_length": 3134.750045776367, "epoch": 0.12205210851463873, "grad_norm": 1.4097696542739868, "kl": 0.679443359375, "learning_rate": 5.21744266211809e-07, "loss": 0.0281, "reward": 0.10988862859085202, "reward_std": 0.5274305082857609, "rewards/cosine_scaled_reward": -0.059639010578393936, "rewards/format_reward": 0.22916666977107525, "step": 284 }, { "completion_length": 2986.1875610351562, "epoch": 0.12248186946011282, "grad_norm": 0.6880308985710144, "kl": 0.595458984375, "learning_rate": 5.186095868151436e-07, "loss": 0.0442, "reward": -0.1320501370355487, "reward_std": 0.6709931120276451, "rewards/cosine_scaled_reward": -0.2222750741057098, "rewards/format_reward": 0.31250000558793545, "step": 285 }, { "completion_length": 2816.6250915527344, "epoch": 0.12291163040558689, "grad_norm": 0.470337450504303, "kl": 0.465087890625, "learning_rate": 5.154764373429315e-07, "loss": 0.0551, "reward": 0.559965031221509, "reward_std": 0.507901031523943, "rewards/cosine_scaled_reward": 0.02998252399265766, "rewards/format_reward": 0.5000000093132257, "step": 286 }, { "completion_length": 2894.2084045410156, "epoch": 0.12334139135106097, "grad_norm": 1.0256301164627075, "kl": 0.4281005859375, "learning_rate": 5.123449705004581e-07, "loss": 0.0826, "reward": 0.006942304782569408, "reward_std": 0.8363403342664242, "rewards/cosine_scaled_reward": -0.16319551563356072, "rewards/format_reward": 0.3333333432674408, "step": 287 }, { "completion_length": 2988.104248046875, "epoch": 0.12377115229653506, "grad_norm": 0.7020933032035828, "kl": 0.41851806640625, "learning_rate": 5.09215338910999e-07, "loss": 0.0364, "reward": 0.161713290726766, "reward_std": 0.7903231792151928, "rewards/cosine_scaled_reward": -0.05456002429127693, "rewards/format_reward": 0.2708333358168602, "step": 288 }, { "completion_length": 2598.333396911621, "epoch": 0.12420091324200913, "grad_norm": 0.893773078918457, "kl": 0.32745361328125, "learning_rate": 5.060876951083828e-07, "loss": 0.0488, "reward": 0.4967697560787201, "reward_std": 0.8897123076021671, "rewards/cosine_scaled_reward": 0.06088486174121499, "rewards/format_reward": 0.3750000074505806, "step": 289 }, { "completion_length": 2831.2709350585938, "epoch": 0.1246306741874832, "grad_norm": 0.668092668056488, "kl": 0.3631591796875, "learning_rate": 5.02962191529556e-07, "loss": 0.0326, "reward": 0.030740363523364067, "reward_std": 0.5826243050396442, "rewards/cosine_scaled_reward": -0.2137964954599738, "rewards/format_reward": 0.4583333395421505, "step": 290 }, { "completion_length": 2395.7500610351562, "epoch": 0.12506043513295728, "grad_norm": 0.5597040057182312, "kl": 0.29315185546875, "learning_rate": 4.998389805071536e-07, "loss": 0.0159, "reward": 0.1374213770031929, "reward_std": 0.8964235186576843, "rewards/cosine_scaled_reward": -0.1396226561628282, "rewards/format_reward": 0.41666667722165585, "step": 291 }, { "completion_length": 2315.3125228881836, "epoch": 0.12549019607843137, "grad_norm": 0.3150510787963867, "kl": 0.2481689453125, "learning_rate": 4.967182142620745e-07, "loss": 0.0337, "reward": 0.46068116615060717, "reward_std": 0.7595715560019016, "rewards/cosine_scaled_reward": -0.08215942978858948, "rewards/format_reward": 0.6250000167638063, "step": 292 }, { "completion_length": 3021.2084350585938, "epoch": 0.12591995702390546, "grad_norm": 0.6819506287574768, "kl": 0.3896484375, "learning_rate": 4.93600044896063e-07, "loss": 0.0441, "reward": 0.08296144660562277, "reward_std": 0.8795575201511383, "rewards/cosine_scaled_reward": -0.16685261274687946, "rewards/format_reward": 0.4166666716337204, "step": 293 }, { "completion_length": 3169.6875915527344, "epoch": 0.12634971796937952, "grad_norm": 1.3509440422058105, "kl": 0.37548828125, "learning_rate": 4.904846243842949e-07, "loss": 0.0921, "reward": -0.008212616667151451, "reward_std": 0.8048211708664894, "rewards/cosine_scaled_reward": -0.1499396455474198, "rewards/format_reward": 0.2916666716337204, "step": 294 }, { "completion_length": 2602.0208587646484, "epoch": 0.1267794789148536, "grad_norm": 0.5518004298210144, "kl": 0.350341796875, "learning_rate": 4.873721045679706e-07, "loss": 0.0447, "reward": 0.206179516389966, "reward_std": 0.5499585885554552, "rewards/cosine_scaled_reward": -0.18857692973688245, "rewards/format_reward": 0.5833333376795053, "step": 295 }, { "completion_length": 2984.354248046875, "epoch": 0.1272092398603277, "grad_norm": 0.5183817148208618, "kl": 0.4542236328125, "learning_rate": 4.842626371469149e-07, "loss": 0.0499, "reward": 0.2680262606590986, "reward_std": 0.7054074257612228, "rewards/cosine_scaled_reward": -0.03265354223549366, "rewards/format_reward": 0.33333334140479565, "step": 296 }, { "completion_length": 2658.8125610351562, "epoch": 0.12763900080580176, "grad_norm": 0.5200396180152893, "kl": 0.366455078125, "learning_rate": 4.811563736721829e-07, "loss": 0.0601, "reward": 0.325771301984787, "reward_std": 0.7670985423028469, "rewards/cosine_scaled_reward": -0.09753102250397205, "rewards/format_reward": 0.5208333469927311, "step": 297 }, { "completion_length": 2891.354217529297, "epoch": 0.12806876175127585, "grad_norm": 0.5782116651535034, "kl": 0.46734619140625, "learning_rate": 4.780534655386743e-07, "loss": 0.0457, "reward": 0.10310862050391734, "reward_std": 0.612139705568552, "rewards/cosine_scaled_reward": -0.21927903965115547, "rewards/format_reward": 0.5416666809469461, "step": 298 }, { "completion_length": 2639.375030517578, "epoch": 0.12849852269674994, "grad_norm": 0.9155395030975342, "kl": 0.46527099609375, "learning_rate": 4.749540639777539e-07, "loss": 0.0144, "reward": 0.11378751136362553, "reward_std": 0.5982479602098465, "rewards/cosine_scaled_reward": -0.1618562415242195, "rewards/format_reward": 0.4375000074505806, "step": 299 }, { "completion_length": 3201.3125610351562, "epoch": 0.128928283642224, "grad_norm": 0.7331289052963257, "kl": 0.67919921875, "learning_rate": 4.7185832004988133e-07, "loss": 0.057, "reward": -0.21577763929963112, "reward_std": 0.46773863583803177, "rewards/cosine_scaled_reward": -0.26413882430642843, "rewards/format_reward": 0.3125000074505806, "step": 300 }, { "completion_length": 3117.229248046875, "epoch": 0.1293580445876981, "grad_norm": 1.407814860343933, "kl": 0.608154296875, "learning_rate": 4.68766384637248e-07, "loss": 0.0166, "reward": -0.03413659578654915, "reward_std": 0.5183165818452835, "rewards/cosine_scaled_reward": -0.21498496271669865, "rewards/format_reward": 0.3958333432674408, "step": 301 }, { "completion_length": 2633.8125610351562, "epoch": 0.12978780553317218, "grad_norm": 0.8635872602462769, "kl": 0.3875732421875, "learning_rate": 4.656784084364238e-07, "loss": 0.0627, "reward": 0.2304842546582222, "reward_std": 0.7685089334845543, "rewards/cosine_scaled_reward": -0.1347578950226307, "rewards/format_reward": 0.5000000093132257, "step": 302 }, { "completion_length": 3020.1666870117188, "epoch": 0.13021756647864624, "grad_norm": 0.7658495903015137, "kl": 0.464599609375, "learning_rate": 4.6259454195101267e-07, "loss": 0.0376, "reward": 0.5107510685920715, "reward_std": 0.8135499954223633, "rewards/cosine_scaled_reward": -0.06754114106297493, "rewards/format_reward": 0.6458333488553762, "step": 303 }, { "completion_length": 2636.8750915527344, "epoch": 0.13064732742412033, "grad_norm": 0.6815863847732544, "kl": 0.393768310546875, "learning_rate": 4.59514935484316e-07, "loss": 0.0367, "reward": 0.41921078506857157, "reward_std": 0.9464630410075188, "rewards/cosine_scaled_reward": -0.019561282824724913, "rewards/format_reward": 0.45833334885537624, "step": 304 }, { "completion_length": 3151.354278564453, "epoch": 0.13107708836959442, "grad_norm": 0.8595097064971924, "kl": 0.47998046875, "learning_rate": 4.5643973913200837e-07, "loss": 0.0522, "reward": 0.4590230342000723, "reward_std": 0.8410362135618925, "rewards/cosine_scaled_reward": -0.01007182092871517, "rewards/format_reward": 0.47916667349636555, "step": 305 }, { "completion_length": 2835.312545776367, "epoch": 0.13150684931506848, "grad_norm": 0.6221079230308533, "kl": 0.4422607421875, "learning_rate": 4.5336910277482155e-07, "loss": 0.0542, "reward": 0.16088844509795308, "reward_std": 0.5803754720836878, "rewards/cosine_scaled_reward": -0.13830577954649925, "rewards/format_reward": 0.4375000074505806, "step": 306 }, { "completion_length": 2835.8125610351562, "epoch": 0.13193661026054257, "grad_norm": 0.6231449246406555, "kl": 0.40673828125, "learning_rate": 4.503031760712397e-07, "loss": 0.0269, "reward": 0.1736403852701187, "reward_std": 0.8766268193721771, "rewards/cosine_scaled_reward": -0.04859647741250228, "rewards/format_reward": 0.27083334140479565, "step": 307 }, { "completion_length": 2986.8750610351562, "epoch": 0.13236637120601666, "grad_norm": 0.9265304207801819, "kl": 0.413330078125, "learning_rate": 4.4724210845020494e-07, "loss": 0.0537, "reward": 0.42360828816890717, "reward_std": 0.8784520626068115, "rewards/cosine_scaled_reward": -0.10069586709141731, "rewards/format_reward": 0.6250000093132257, "step": 308 }, { "completion_length": 3135.6250610351562, "epoch": 0.13279613215149072, "grad_norm": 0.46817779541015625, "kl": 0.4849853515625, "learning_rate": 4.441860491038345e-07, "loss": 0.0446, "reward": -0.13473229110240936, "reward_std": 0.7253297679126263, "rewards/cosine_scaled_reward": -0.24444948695600033, "rewards/format_reward": 0.35416667349636555, "step": 309 }, { "completion_length": 2781.8958892822266, "epoch": 0.1332258930969648, "grad_norm": 1.2285687923431396, "kl": 0.4296875, "learning_rate": 4.4113514698014953e-07, "loss": -0.007, "reward": 0.22559093311429024, "reward_std": 0.718120601028204, "rewards/cosine_scaled_reward": -0.15803787391632795, "rewards/format_reward": 0.5416666772216558, "step": 310 }, { "completion_length": 2901.479217529297, "epoch": 0.1336556540424389, "grad_norm": 1.1696747541427612, "kl": 0.560546875, "learning_rate": 4.3808955077581546e-07, "loss": 0.0884, "reward": 0.8927975045517087, "reward_std": 0.9878052957355976, "rewards/cosine_scaled_reward": 0.14431539364159107, "rewards/format_reward": 0.6041666809469461, "step": 311 }, { "completion_length": 2734.520896911621, "epoch": 0.13408541498791296, "grad_norm": 0.48847606778144836, "kl": 0.365936279296875, "learning_rate": 4.350494089288943e-07, "loss": 0.0506, "reward": 0.2431645654141903, "reward_std": 0.746571522206068, "rewards/cosine_scaled_reward": -0.06591770891100168, "rewards/format_reward": 0.37500000931322575, "step": 312 }, { "completion_length": 2792.9584197998047, "epoch": 0.13451517593338705, "grad_norm": 0.7591063380241394, "kl": 0.53125, "learning_rate": 4.3201486961161093e-07, "loss": 0.0465, "reward": 0.47540872171521187, "reward_std": 0.7935304790735245, "rewards/cosine_scaled_reward": -0.0747956451959908, "rewards/format_reward": 0.6250000260770321, "step": 313 }, { "completion_length": 2417.9583740234375, "epoch": 0.13494493687886114, "grad_norm": 1.4392017126083374, "kl": 0.36376953125, "learning_rate": 4.2898608072313045e-07, "loss": 0.0888, "reward": 0.8342535467818379, "reward_std": 0.8982183635234833, "rewards/cosine_scaled_reward": 0.1462934287264943, "rewards/format_reward": 0.5416666809469461, "step": 314 }, { "completion_length": 2910.416732788086, "epoch": 0.13537469782433523, "grad_norm": 0.5072810649871826, "kl": 0.543212890625, "learning_rate": 4.2596318988235037e-07, "loss": 0.0644, "reward": 0.38547870388720185, "reward_std": 0.7502098754048347, "rewards/cosine_scaled_reward": -0.1093439944088459, "rewards/format_reward": 0.6041666846722364, "step": 315 }, { "completion_length": 2920.1875610351562, "epoch": 0.1358044587698093, "grad_norm": 0.6674709916114807, "kl": 0.58349609375, "learning_rate": 4.2294634442070553e-07, "loss": 0.0731, "reward": 0.35920436796732247, "reward_std": 0.8548001684248447, "rewards/cosine_scaled_reward": -0.08081448613665998, "rewards/format_reward": 0.520833345130086, "step": 316 }, { "completion_length": 2953.5833892822266, "epoch": 0.13623421971528338, "grad_norm": 0.594467043876648, "kl": 0.5233154296875, "learning_rate": 4.1993569137498776e-07, "loss": 0.0509, "reward": 0.06915194261819124, "reward_std": 0.6348052583634853, "rewards/cosine_scaled_reward": -0.1425073640421033, "rewards/format_reward": 0.35416667349636555, "step": 317 }, { "completion_length": 2890.5208740234375, "epoch": 0.13666398066075747, "grad_norm": 1.1461410522460938, "kl": 0.5965576171875, "learning_rate": 4.1693137748017915e-07, "loss": 0.0262, "reward": -0.11665917420759797, "reward_std": 0.6380915716290474, "rewards/cosine_scaled_reward": -0.18332959711551666, "rewards/format_reward": 0.2500000037252903, "step": 318 }, { "completion_length": 2783.3959197998047, "epoch": 0.13709374160623153, "grad_norm": 0.6669846177101135, "kl": 0.6087646484375, "learning_rate": 4.1393354916230005e-07, "loss": 0.0678, "reward": 0.4914609114639461, "reward_std": 0.8558991253376007, "rewards/cosine_scaled_reward": -0.09801955381408334, "rewards/format_reward": 0.6875000111758709, "step": 319 }, { "completion_length": 3056.3334350585938, "epoch": 0.13752350255170562, "grad_norm": 1.1379731893539429, "kl": 0.62646484375, "learning_rate": 4.1094235253127374e-07, "loss": 0.1013, "reward": 0.33441915828734636, "reward_std": 0.8576799184083939, "rewards/cosine_scaled_reward": -0.12445710925385356, "rewards/format_reward": 0.5833333488553762, "step": 320 }, { "completion_length": 2863.437530517578, "epoch": 0.1379532634971797, "grad_norm": 1.3984326124191284, "kl": 0.569580078125, "learning_rate": 4.079579333738039e-07, "loss": -0.0035, "reward": -0.03602213738486171, "reward_std": 0.6677563302218914, "rewards/cosine_scaled_reward": -0.23676107544451952, "rewards/format_reward": 0.4375000111758709, "step": 321 }, { "completion_length": 2754.791732788086, "epoch": 0.13838302444265377, "grad_norm": 0.884214460849762, "kl": 0.563507080078125, "learning_rate": 4.0498043714627006e-07, "loss": 0.0556, "reward": 0.2946261502802372, "reward_std": 0.7827679850161076, "rewards/cosine_scaled_reward": -0.13393694336991757, "rewards/format_reward": 0.5625000186264515, "step": 322 }, { "completion_length": 2500.8959045410156, "epoch": 0.13881278538812786, "grad_norm": 1.3111748695373535, "kl": 0.431884765625, "learning_rate": 4.020100089676376e-07, "loss": -0.012, "reward": 0.21313768532127142, "reward_std": 0.5324850175529718, "rewards/cosine_scaled_reward": -0.17468117363750935, "rewards/format_reward": 0.5625000186264515, "step": 323 }, { "completion_length": 3045.3751220703125, "epoch": 0.13924254633360195, "grad_norm": 0.5126330256462097, "kl": 0.539306640625, "learning_rate": 3.9904679361238526e-07, "loss": 0.0586, "reward": 0.39001701725646853, "reward_std": 0.7129775620996952, "rewards/cosine_scaled_reward": -0.08624151907861233, "rewards/format_reward": 0.5625000074505806, "step": 324 }, { "completion_length": 2771.2501068115234, "epoch": 0.139672307279076, "grad_norm": 1.5353480577468872, "kl": 0.503173828125, "learning_rate": 3.9609093550344907e-07, "loss": -0.0174, "reward": -0.06061364710330963, "reward_std": 0.5887418836355209, "rewards/cosine_scaled_reward": -0.23864017147570848, "rewards/format_reward": 0.41666667722165585, "step": 325 }, { "completion_length": 2732.3542404174805, "epoch": 0.1401020682245501, "grad_norm": 0.8640240430831909, "kl": 0.39447021484375, "learning_rate": 3.931425787051832e-07, "loss": 0.0091, "reward": 0.011351191438734531, "reward_std": 0.533912954851985, "rewards/cosine_scaled_reward": -0.22349108196794987, "rewards/format_reward": 0.4583333469927311, "step": 326 }, { "completion_length": 2270.6875915527344, "epoch": 0.1405318291700242, "grad_norm": 0.38664567470550537, "kl": 0.31805419921875, "learning_rate": 3.902018669163384e-07, "loss": 0.0247, "reward": 0.48958762595430017, "reward_std": 0.5647754510864615, "rewards/cosine_scaled_reward": -0.05728952866047621, "rewards/format_reward": 0.6041666679084301, "step": 327 }, { "completion_length": 2951.229263305664, "epoch": 0.14096159011549825, "grad_norm": 0.6679341197013855, "kl": 0.4625244140625, "learning_rate": 3.872689434630585e-07, "loss": 0.0359, "reward": 0.3632780034095049, "reward_std": 1.090777151286602, "rewards/cosine_scaled_reward": -0.07877768483012915, "rewards/format_reward": 0.5208333469927311, "step": 328 }, { "completion_length": 3017.791778564453, "epoch": 0.14139135106097234, "grad_norm": 0.540334165096283, "kl": 0.393798828125, "learning_rate": 3.843439512918949e-07, "loss": 0.0298, "reward": 0.3360184940393083, "reward_std": 0.6827290989458561, "rewards/cosine_scaled_reward": -0.11324076016899198, "rewards/format_reward": 0.5625000149011612, "step": 329 }, { "completion_length": 2125.104217529297, "epoch": 0.14182111200644643, "grad_norm": 0.8820216655731201, "kl": 0.22869873046875, "learning_rate": 3.8142703296283953e-07, "loss": 0.0184, "reward": 0.1553977158619091, "reward_std": 0.7506780847907066, "rewards/cosine_scaled_reward": -0.03688447852618992, "rewards/format_reward": 0.2291666716337204, "step": 330 }, { "completion_length": 2660.104248046875, "epoch": 0.1422508729519205, "grad_norm": 1.0805774927139282, "kl": 0.255615234375, "learning_rate": 3.785183306423767e-07, "loss": -0.0397, "reward": 0.5456520663574338, "reward_std": 0.7654032185673714, "rewards/cosine_scaled_reward": 0.022826028987765312, "rewards/format_reward": 0.5000000167638063, "step": 331 }, { "completion_length": 2792.0834197998047, "epoch": 0.14268063389739458, "grad_norm": 0.45976248383522034, "kl": 0.30499267578125, "learning_rate": 3.7561798609655373e-07, "loss": 0.0446, "reward": 0.2641484900377691, "reward_std": 0.6911925561726093, "rewards/cosine_scaled_reward": -0.1908424273133278, "rewards/format_reward": 0.6458333414047956, "step": 332 }, { "completion_length": 2452.7708892822266, "epoch": 0.14311039484286867, "grad_norm": 0.4910357892513275, "kl": 0.18536376953125, "learning_rate": 3.72726140684072e-07, "loss": -0.0045, "reward": 0.4173534968867898, "reward_std": 0.6911480948328972, "rewards/cosine_scaled_reward": -0.06215659203007817, "rewards/format_reward": 0.5416666828095913, "step": 333 }, { "completion_length": 2067.541717529297, "epoch": 0.14354015578834273, "grad_norm": 0.5694600343704224, "kl": 0.17529296875, "learning_rate": 3.6984293534939737e-07, "loss": -0.0296, "reward": 0.0751588474959135, "reward_std": 0.7766067013144493, "rewards/cosine_scaled_reward": -0.12908724322915077, "rewards/format_reward": 0.3333333432674408, "step": 334 }, { "completion_length": 2454.0416870117188, "epoch": 0.14396991673381682, "grad_norm": 0.6721600890159607, "kl": 0.1517181396484375, "learning_rate": 3.6696851061588994e-07, "loss": 0.024, "reward": 0.0100714061409235, "reward_std": 0.7189024537801743, "rewards/cosine_scaled_reward": -0.161630965070799, "rewards/format_reward": 0.3333333395421505, "step": 335 }, { "completion_length": 2457.687568664551, "epoch": 0.1443996776792909, "grad_norm": 0.7564294934272766, "kl": 0.20794677734375, "learning_rate": 3.641030065789562e-07, "loss": -0.0303, "reward": 0.23273339742445387, "reward_std": 0.7151405997574329, "rewards/cosine_scaled_reward": -0.060716643929481506, "rewards/format_reward": 0.3541666753590107, "step": 336 }, { "completion_length": 2747.0625762939453, "epoch": 0.14482943862476497, "grad_norm": 0.5136157274246216, "kl": 0.19384765625, "learning_rate": 3.612465628992203e-07, "loss": -0.008, "reward": 0.3228631041420158, "reward_std": 0.863990843296051, "rewards/cosine_scaled_reward": -0.07815178856253624, "rewards/format_reward": 0.4791666753590107, "step": 337 }, { "completion_length": 2567.0625762939453, "epoch": 0.14525919957023906, "grad_norm": 0.2343512773513794, "kl": 0.175811767578125, "learning_rate": 3.5839931879571725e-07, "loss": -0.0, "reward": 0.07286785542964935, "reward_std": 0.6972073242068291, "rewards/cosine_scaled_reward": -0.14064940437674522, "rewards/format_reward": 0.35416667349636555, "step": 338 }, { "completion_length": 3021.0834045410156, "epoch": 0.14568896051571315, "grad_norm": 0.3380245566368103, "kl": 0.20489501953125, "learning_rate": 3.555614130391079e-07, "loss": -0.0167, "reward": 0.024652503663673997, "reward_std": 0.7774523422122002, "rewards/cosine_scaled_reward": -0.175173751427792, "rewards/format_reward": 0.37500000558793545, "step": 339 }, { "completion_length": 2522.458396911621, "epoch": 0.1461187214611872, "grad_norm": 1.202372670173645, "kl": 0.2108154296875, "learning_rate": 3.5273298394491515e-07, "loss": 0.0617, "reward": 0.3738751672208309, "reward_std": 0.9323135502636433, "rewards/cosine_scaled_reward": -0.05264576291665435, "rewards/format_reward": 0.4791666865348816, "step": 340 }, { "completion_length": 1932.6875762939453, "epoch": 0.1465484824066613, "grad_norm": 0.7924821376800537, "kl": 0.1331787109375, "learning_rate": 3.4991416936678276e-07, "loss": -0.0602, "reward": 0.23160698357969522, "reward_std": 0.703599464148283, "rewards/cosine_scaled_reward": 0.01163682306651026, "rewards/format_reward": 0.2083333358168602, "step": 341 }, { "completion_length": 3101.5625610351562, "epoch": 0.14697824335213538, "grad_norm": 1.124794840812683, "kl": 0.197357177734375, "learning_rate": 3.471051066897562e-07, "loss": 0.0403, "reward": -0.021835686173290014, "reward_std": 0.7787010725587606, "rewards/cosine_scaled_reward": -0.15675118938088417, "rewards/format_reward": 0.29166667349636555, "step": 342 }, { "completion_length": 2852.2084350585938, "epoch": 0.14740800429760945, "grad_norm": 0.7064228057861328, "kl": 0.183746337890625, "learning_rate": 3.4430593282358777e-07, "loss": -0.0487, "reward": -0.040804795920848846, "reward_std": 0.6819908730685711, "rewards/cosine_scaled_reward": -0.21831907203886658, "rewards/format_reward": 0.3958333432674408, "step": 343 }, { "completion_length": 2656.104217529297, "epoch": 0.14783776524308354, "grad_norm": 0.2824373245239258, "kl": 0.17974853515625, "learning_rate": 3.4151678419606233e-07, "loss": 0.0038, "reward": 0.11962158465757966, "reward_std": 0.6218832246959209, "rewards/cosine_scaled_reward": -0.15893921442329884, "rewards/format_reward": 0.4375000149011612, "step": 344 }, { "completion_length": 3311.6250610351562, "epoch": 0.14826752618855762, "grad_norm": 0.3584754467010498, "kl": 0.274627685546875, "learning_rate": 3.387377967463493e-07, "loss": 0.0307, "reward": -0.20400369982235134, "reward_std": 0.518766526132822, "rewards/cosine_scaled_reward": -0.2790851891040802, "rewards/format_reward": 0.3541666716337204, "step": 345 }, { "completion_length": 2764.9375915527344, "epoch": 0.14869728713403169, "grad_norm": 0.3505573868751526, "kl": 0.1893310546875, "learning_rate": 3.359691059183761e-07, "loss": 0.0124, "reward": 0.41804545745253563, "reward_std": 0.7321450412273407, "rewards/cosine_scaled_reward": -0.08264393848367035, "rewards/format_reward": 0.5833333563059568, "step": 346 }, { "completion_length": 2736.166763305664, "epoch": 0.14912704807950578, "grad_norm": 0.9764425158500671, "kl": 0.1334686279296875, "learning_rate": 3.3321084665422803e-07, "loss": 0.0548, "reward": 0.11044287867844105, "reward_std": 0.7204113900661469, "rewards/cosine_scaled_reward": -0.1114452462643385, "rewards/format_reward": 0.33333333767950535, "step": 347 }, { "completion_length": 2837.0000915527344, "epoch": 0.14955680902497986, "grad_norm": 0.8826309442520142, "kl": 0.22235107421875, "learning_rate": 3.3046315338757026e-07, "loss": 0.0555, "reward": 0.24655969627201557, "reward_std": 0.7638373412191868, "rewards/cosine_scaled_reward": -0.12672017142176628, "rewards/format_reward": 0.5000000149011612, "step": 348 }, { "completion_length": 2738.5626068115234, "epoch": 0.14998656997045393, "grad_norm": 0.4092019498348236, "kl": 0.22637939453125, "learning_rate": 3.2772616003709616e-07, "loss": 0.0432, "reward": 0.24562328308820724, "reward_std": 0.7735979110002518, "rewards/cosine_scaled_reward": -0.11677170475013554, "rewards/format_reward": 0.47916667349636555, "step": 349 }, { "completion_length": 2973.979202270508, "epoch": 0.15041633091592801, "grad_norm": 0.4113962650299072, "kl": 0.1741943359375, "learning_rate": 3.250000000000001e-07, "loss": 0.0125, "reward": 0.015167806297540665, "reward_std": 0.7057143785059452, "rewards/cosine_scaled_reward": -0.10699944011867046, "rewards/format_reward": 0.2291666716337204, "step": 350 }, { "completion_length": 2566.291732788086, "epoch": 0.1508460918614021, "grad_norm": 0.6885215640068054, "kl": 0.1677398681640625, "learning_rate": 3.222848061454764e-07, "loss": 0.031, "reward": -0.16475414636079222, "reward_std": 0.7159660011529922, "rewards/cosine_scaled_reward": -0.19696041010320187, "rewards/format_reward": 0.22916666977107525, "step": 351 }, { "completion_length": 2855.416732788086, "epoch": 0.15127585280687617, "grad_norm": 0.5053523182868958, "kl": 0.1993408203125, "learning_rate": 3.195807108082429e-07, "loss": 0.0005, "reward": 0.08456875383853912, "reward_std": 0.6974059194326401, "rewards/cosine_scaled_reward": -0.14521562634035945, "rewards/format_reward": 0.3750000074505806, "step": 352 }, { "completion_length": 2630.0000610351562, "epoch": 0.15170561375235025, "grad_norm": 0.5430624485015869, "kl": 0.2214813232421875, "learning_rate": 3.168878457820915e-07, "loss": 0.019, "reward": 0.594953391700983, "reward_std": 0.6635894067585468, "rewards/cosine_scaled_reward": 0.037060029804706573, "rewards/format_reward": 0.5208333432674408, "step": 353 }, { "completion_length": 2468.604248046875, "epoch": 0.15213537469782434, "grad_norm": 0.5476293563842773, "kl": 0.175384521484375, "learning_rate": 3.142063423134644e-07, "loss": 0.0243, "reward": 0.21576347574591637, "reward_std": 0.7657241113483906, "rewards/cosine_scaled_reward": -0.13170161424204707, "rewards/format_reward": 0.47916667722165585, "step": 354 }, { "completion_length": 2523.041748046875, "epoch": 0.1525651356432984, "grad_norm": 0.9749562740325928, "kl": 0.2266387939453125, "learning_rate": 3.115363310950578e-07, "loss": -0.0125, "reward": 0.7230267710983753, "reward_std": 0.9085442908108234, "rewards/cosine_scaled_reward": -0.04473661910742521, "rewards/format_reward": 0.8125000149011612, "step": 355 }, { "completion_length": 2658.0209045410156, "epoch": 0.1529948965887725, "grad_norm": 0.6088355183601379, "kl": 0.260986328125, "learning_rate": 3.0887794225945143e-07, "loss": 0.0463, "reward": 0.09969064360484481, "reward_std": 0.7391313090920448, "rewards/cosine_scaled_reward": -0.18973802542313933, "rewards/format_reward": 0.47916667349636555, "step": 356 }, { "completion_length": 2551.229263305664, "epoch": 0.15342465753424658, "grad_norm": 0.4824003577232361, "kl": 0.216705322265625, "learning_rate": 3.062313053727671e-07, "loss": 0.0215, "reward": 0.017955926712602377, "reward_std": 0.5086074396967888, "rewards/cosine_scaled_reward": -0.14727204479277134, "rewards/format_reward": 0.3125000074505806, "step": 357 }, { "completion_length": 2536.7709045410156, "epoch": 0.15385441847972064, "grad_norm": 1.257851481437683, "kl": 0.160430908203125, "learning_rate": 3.0359654942835247e-07, "loss": 0.0987, "reward": 0.2542876647785306, "reward_std": 0.9615574702620506, "rewards/cosine_scaled_reward": -0.039522842795122415, "rewards/format_reward": 0.3333333358168602, "step": 358 }, { "completion_length": 2476.0417098999023, "epoch": 0.15428417942519473, "grad_norm": 0.40752243995666504, "kl": 0.26104736328125, "learning_rate": 3.0097380284049523e-07, "loss": 0.0275, "reward": 0.04408153332769871, "reward_std": 0.7049014717340469, "rewards/cosine_scaled_reward": -0.12379257380962372, "rewards/format_reward": 0.29166667722165585, "step": 359 }, { "completion_length": 2485.041763305664, "epoch": 0.15471394037066882, "grad_norm": 1.1878323554992676, "kl": 0.279296875, "learning_rate": 2.9836319343816397e-07, "loss": -0.0256, "reward": 0.14325070613995194, "reward_std": 0.608488917350769, "rewards/cosine_scaled_reward": -0.2096246536821127, "rewards/format_reward": 0.5625000149011612, "step": 360 }, { "completion_length": 2517.9584197998047, "epoch": 0.15514370131614288, "grad_norm": 1.3002047538757324, "kl": 0.174285888671875, "learning_rate": 2.9576484845877793e-07, "loss": 0.0351, "reward": 0.7727316580712795, "reward_std": 0.7780123837292194, "rewards/cosine_scaled_reward": 0.05303249694406986, "rewards/format_reward": 0.6666666865348816, "step": 361 }, { "completion_length": 2507.7083892822266, "epoch": 0.15557346226161697, "grad_norm": 0.6128243803977966, "kl": 0.2149658203125, "learning_rate": 2.931788945420058e-07, "loss": 0.0214, "reward": 0.5492791645228863, "reward_std": 0.8482350520789623, "rewards/cosine_scaled_reward": 0.055889594135805964, "rewards/format_reward": 0.4375000037252903, "step": 362 }, { "completion_length": 2903.75008392334, "epoch": 0.15600322320709106, "grad_norm": 0.6711751222610474, "kl": 0.36846923828125, "learning_rate": 2.9060545772359305e-07, "loss": 0.0084, "reward": 0.5605523803969845, "reward_std": 0.8616843000054359, "rewards/cosine_scaled_reward": 0.06152619468048215, "rewards/format_reward": 0.4375000037252903, "step": 363 }, { "completion_length": 2986.625030517578, "epoch": 0.15643298415256512, "grad_norm": 0.45469310879707336, "kl": 0.44580078125, "learning_rate": 2.8804466342921987e-07, "loss": 0.0358, "reward": -0.11365249298978597, "reward_std": 0.617549367249012, "rewards/cosine_scaled_reward": -0.2026595761999488, "rewards/format_reward": 0.2916666753590107, "step": 364 }, { "completion_length": 2493.5833892822266, "epoch": 0.1568627450980392, "grad_norm": 0.841419517993927, "kl": 0.2928466796875, "learning_rate": 2.854966364683872e-07, "loss": -0.0111, "reward": 0.3437331975437701, "reward_std": 0.7947186268866062, "rewards/cosine_scaled_reward": -0.1198000768199563, "rewards/format_reward": 0.5833333432674408, "step": 365 }, { "completion_length": 2534.3333892822266, "epoch": 0.1572925060435133, "grad_norm": 0.628459632396698, "kl": 0.2187652587890625, "learning_rate": 2.829615010283344e-07, "loss": -0.0333, "reward": 0.4995113234035671, "reward_std": 1.0794470235705376, "rewards/cosine_scaled_reward": 0.031005645403638482, "rewards/format_reward": 0.43750000931322575, "step": 366 }, { "completion_length": 2803.8125915527344, "epoch": 0.15772226698898736, "grad_norm": 0.8800026774406433, "kl": 0.2694091796875, "learning_rate": 2.8043938066798645e-07, "loss": -0.0122, "reward": 0.3290752060711384, "reward_std": 0.5414085909724236, "rewards/cosine_scaled_reward": -0.1271290685981512, "rewards/format_reward": 0.5833333395421505, "step": 367 }, { "completion_length": 2533.250045776367, "epoch": 0.15815202793446145, "grad_norm": 0.629287838935852, "kl": 0.266448974609375, "learning_rate": 2.7793039831193133e-07, "loss": -0.0095, "reward": 0.26916775976860663, "reward_std": 0.6661453694105148, "rewards/cosine_scaled_reward": -0.10499946214258671, "rewards/format_reward": 0.4791666716337204, "step": 368 }, { "completion_length": 2351.8959350585938, "epoch": 0.15858178887993554, "grad_norm": 0.7635890245437622, "kl": 0.212982177734375, "learning_rate": 2.7543467624442956e-07, "loss": -0.0341, "reward": 0.2418502545915544, "reward_std": 0.7671771105378866, "rewards/cosine_scaled_reward": -0.16032487526535988, "rewards/format_reward": 0.5625000111758709, "step": 369 }, { "completion_length": 3067.500015258789, "epoch": 0.1590115498254096, "grad_norm": 0.5229648351669312, "kl": 0.3353271484375, "learning_rate": 2.729523361034538e-07, "loss": 0.0223, "reward": -0.05337494984269142, "reward_std": 0.5662420876324177, "rewards/cosine_scaled_reward": -0.2141874749213457, "rewards/format_reward": 0.37500001303851604, "step": 370 }, { "completion_length": 2689.541717529297, "epoch": 0.1594413107708837, "grad_norm": 0.38721764087677, "kl": 0.23284912109375, "learning_rate": 2.7048349887476037e-07, "loss": 0.0183, "reward": -0.2551152198575437, "reward_std": 0.6268425174057484, "rewards/cosine_scaled_reward": -0.2525576092302799, "rewards/format_reward": 0.2500000074505806, "step": 371 }, { "completion_length": 2483.3333740234375, "epoch": 0.15987107171635778, "grad_norm": 0.3538893461227417, "kl": 0.1558837890625, "learning_rate": 2.6802828488599294e-07, "loss": 0.0112, "reward": 0.46667767874896526, "reward_std": 0.47529415413737297, "rewards/cosine_scaled_reward": 0.004172150045633316, "rewards/format_reward": 0.45833334140479565, "step": 372 }, { "completion_length": 3308.979248046875, "epoch": 0.16030083266183184, "grad_norm": 0.9392606616020203, "kl": 0.281005859375, "learning_rate": 2.655868138008171e-07, "loss": 0.0356, "reward": 0.05111550260335207, "reward_std": 0.742233395576477, "rewards/cosine_scaled_reward": -0.12027557939291, "rewards/format_reward": 0.29166667349636555, "step": 373 }, { "completion_length": 3045.7709045410156, "epoch": 0.16073059360730593, "grad_norm": 0.9690319895744324, "kl": 0.287109375, "learning_rate": 2.631592046130896e-07, "loss": 0.0394, "reward": 0.5512113757431507, "reward_std": 1.0046659223735332, "rewards/cosine_scaled_reward": 0.015189019963145256, "rewards/format_reward": 0.5208333469927311, "step": 374 }, { "completion_length": 2509.7500762939453, "epoch": 0.16116035455278002, "grad_norm": 0.2824810743331909, "kl": 0.2246551513671875, "learning_rate": 2.6074557564105724e-07, "loss": 0.018, "reward": 0.5283350143581629, "reward_std": 0.572848729789257, "rewards/cosine_scaled_reward": -0.058749159798026085, "rewards/format_reward": 0.6458333414047956, "step": 375 }, { "completion_length": 3088.4584350585938, "epoch": 0.16159011549825408, "grad_norm": 0.5544974207878113, "kl": 0.32000732421875, "learning_rate": 2.583460445215911e-07, "loss": 0.06, "reward": -0.1279699569568038, "reward_std": 0.5798083059489727, "rewards/cosine_scaled_reward": -0.2306516468524933, "rewards/format_reward": 0.3333333395421505, "step": 376 }, { "completion_length": 2941.3959197998047, "epoch": 0.16201987644372817, "grad_norm": 0.7214526534080505, "kl": 0.25482177734375, "learning_rate": 2.5596072820445254e-07, "loss": -0.0117, "reward": 0.13005489902570844, "reward_std": 0.9419071450829506, "rewards/cosine_scaled_reward": -0.18497256468981504, "rewards/format_reward": 0.5000000074505806, "step": 377 }, { "completion_length": 2724.0209350585938, "epoch": 0.16244963738920226, "grad_norm": 1.0229369401931763, "kl": 0.22515869140625, "learning_rate": 2.5358974294659373e-07, "loss": -0.0263, "reward": 0.007891247660154477, "reward_std": 0.5536177344620228, "rewards/cosine_scaled_reward": -0.23563772067427635, "rewards/format_reward": 0.479166679084301, "step": 378 }, { "completion_length": 2184.50008392334, "epoch": 0.16287939833467632, "grad_norm": 1.4159295558929443, "kl": 0.159393310546875, "learning_rate": 2.512332043064913e-07, "loss": 0.0376, "reward": 0.4861293099820614, "reward_std": 0.9369241334497929, "rewards/cosine_scaled_reward": 0.04514796147122979, "rewards/format_reward": 0.3958333469927311, "step": 379 }, { "completion_length": 2573.416702270508, "epoch": 0.1633091592801504, "grad_norm": 1.2069127559661865, "kl": 0.278289794921875, "learning_rate": 2.488912271385139e-07, "loss": -0.0232, "reward": 0.6741883035283536, "reward_std": 0.7952763140201569, "rewards/cosine_scaled_reward": 0.035010804422199726, "rewards/format_reward": 0.6041666697710752, "step": 380 }, { "completion_length": 3240.541748046875, "epoch": 0.1637389202256245, "grad_norm": 1.1569398641586304, "kl": 0.3238525390625, "learning_rate": 2.465639255873246e-07, "loss": 0.056, "reward": 0.007905647158622742, "reward_std": 0.6461758073419333, "rewards/cosine_scaled_reward": -0.19396385177969933, "rewards/format_reward": 0.39583334140479565, "step": 381 }, { "completion_length": 2616.354248046875, "epoch": 0.16416868117109856, "grad_norm": 0.42863553762435913, "kl": 0.2731781005859375, "learning_rate": 2.4425141308231765e-07, "loss": 0.0228, "reward": 0.23188474751077592, "reward_std": 0.5089180562645197, "rewards/cosine_scaled_reward": -0.13405762985348701, "rewards/format_reward": 0.5000000074505806, "step": 382 }, { "completion_length": 3060.729217529297, "epoch": 0.16459844211657265, "grad_norm": 0.35173001885414124, "kl": 0.316162109375, "learning_rate": 2.4195380233209006e-07, "loss": 0.0123, "reward": 0.04300548415631056, "reward_std": 0.7137174494564533, "rewards/cosine_scaled_reward": -0.17641392163932323, "rewards/format_reward": 0.39583334513008595, "step": 383 }, { "completion_length": 2678.3542098999023, "epoch": 0.16502820306204674, "grad_norm": 0.3579236567020416, "kl": 0.27490234375, "learning_rate": 2.3967120531894857e-07, "loss": 0.0349, "reward": 0.4016927039483562, "reward_std": 0.7269558496773243, "rewards/cosine_scaled_reward": -0.04915365343913436, "rewards/format_reward": 0.5000000055879354, "step": 384 }, { "completion_length": 2681.291717529297, "epoch": 0.16545796400752083, "grad_norm": 0.9293654561042786, "kl": 0.242706298828125, "learning_rate": 2.374037332934512e-07, "loss": -0.0136, "reward": 0.46986853424459696, "reward_std": 0.7380744218826294, "rewards/cosine_scaled_reward": -0.05673240125179291, "rewards/format_reward": 0.5833333507180214, "step": 385 }, { "completion_length": 2559.6250534057617, "epoch": 0.1658877249529949, "grad_norm": 0.5653667449951172, "kl": 0.24530029296875, "learning_rate": 2.3515149676898552e-07, "loss": 0.0381, "reward": 0.5297290608286858, "reward_std": 0.7085526902228594, "rewards/cosine_scaled_reward": -0.016385470516979694, "rewards/format_reward": 0.5625000111758709, "step": 386 }, { "completion_length": 2896.354202270508, "epoch": 0.16631748589846898, "grad_norm": 0.6653679609298706, "kl": 0.26513671875, "learning_rate": 2.3291460551638237e-07, "loss": 0.0484, "reward": 0.5095941107720137, "reward_std": 0.7029152773320675, "rewards/cosine_scaled_reward": -0.026452949037775397, "rewards/format_reward": 0.5625000111758709, "step": 387 }, { "completion_length": 2324.062530517578, "epoch": 0.16674724684394307, "grad_norm": 1.4476134777069092, "kl": 0.2347412109375, "learning_rate": 2.306931685585657e-07, "loss": -0.035, "reward": 0.3623122707940638, "reward_std": 0.7447656877338886, "rewards/cosine_scaled_reward": 0.02490611933171749, "rewards/format_reward": 0.31250000931322575, "step": 388 }, { "completion_length": 2795.2709350585938, "epoch": 0.16717700778941713, "grad_norm": 0.8418478965759277, "kl": 0.2666015625, "learning_rate": 2.2848729416523859e-07, "loss": 0.0523, "reward": 0.2763203107751906, "reward_std": 0.6783265620470047, "rewards/cosine_scaled_reward": -0.1222565226489678, "rewards/format_reward": 0.5208333414047956, "step": 389 }, { "completion_length": 2872.1250610351562, "epoch": 0.16760676873489122, "grad_norm": 0.34217265248298645, "kl": 0.2882080078125, "learning_rate": 2.2629708984760706e-07, "loss": 0.0416, "reward": 0.055314210563665256, "reward_std": 0.5723797865211964, "rewards/cosine_scaled_reward": -0.24317624419927597, "rewards/format_reward": 0.541666679084301, "step": 390 }, { "completion_length": 3269.0001220703125, "epoch": 0.1680365296803653, "grad_norm": 0.436880499124527, "kl": 0.282470703125, "learning_rate": 2.2412266235313973e-07, "loss": 0.0108, "reward": 0.27972774649970233, "reward_std": 0.7292295396327972, "rewards/cosine_scaled_reward": -0.16221946198493242, "rewards/format_reward": 0.6041666846722364, "step": 391 }, { "completion_length": 2715.0833740234375, "epoch": 0.16846629062583937, "grad_norm": 0.3619132339954376, "kl": 0.298828125, "learning_rate": 2.2196411766036487e-07, "loss": 0.0356, "reward": 0.409868448972702, "reward_std": 0.5239976085722446, "rewards/cosine_scaled_reward": 0.01743422821164131, "rewards/format_reward": 0.3750000111758709, "step": 392 }, { "completion_length": 2428.5000915527344, "epoch": 0.16889605157131346, "grad_norm": 0.36444830894470215, "kl": 0.22918701171875, "learning_rate": 2.1982156097370557e-07, "loss": 0.0085, "reward": 0.3471107608638704, "reward_std": 0.6408367231488228, "rewards/cosine_scaled_reward": -0.0868612986523658, "rewards/format_reward": 0.520833333954215, "step": 393 }, { "completion_length": 2497.1875762939453, "epoch": 0.16932581251678755, "grad_norm": 0.4226372539997101, "kl": 0.239166259765625, "learning_rate": 2.1769509671835223e-07, "loss": 0.0239, "reward": 0.21845147479325533, "reward_std": 0.6216540783643723, "rewards/cosine_scaled_reward": -0.14077427051961422, "rewards/format_reward": 0.5000000093132257, "step": 394 }, { "completion_length": 2799.2708740234375, "epoch": 0.1697555734622616, "grad_norm": 0.6263633370399475, "kl": 0.2833251953125, "learning_rate": 2.1558482853517253e-07, "loss": 0.0279, "reward": 0.3871050952002406, "reward_std": 0.7789968326687813, "rewards/cosine_scaled_reward": -0.09811412380076945, "rewards/format_reward": 0.583333345130086, "step": 395 }, { "completion_length": 2669.041748046875, "epoch": 0.1701853344077357, "grad_norm": 1.599164366722107, "kl": 0.2664794921875, "learning_rate": 2.134908592756607e-07, "loss": 0.0942, "reward": 0.500509912148118, "reward_std": 0.7327203564345837, "rewards/cosine_scaled_reward": -0.06224504951387644, "rewards/format_reward": 0.6250000260770321, "step": 396 }, { "completion_length": 2686.354217529297, "epoch": 0.1706150953532098, "grad_norm": 1.441612958908081, "kl": 0.2506103515625, "learning_rate": 2.1141329099692406e-07, "loss": 0.0558, "reward": 0.38104987842962146, "reward_std": 0.8380836769938469, "rewards/cosine_scaled_reward": -0.028225065441802144, "rewards/format_reward": 0.43750001303851604, "step": 397 }, { "completion_length": 2905.229248046875, "epoch": 0.17104485629868385, "grad_norm": 0.47521936893463135, "kl": 0.381072998046875, "learning_rate": 2.0935222495670968e-07, "loss": 0.0385, "reward": 0.8250126583734527, "reward_std": 0.7463340889662504, "rewards/cosine_scaled_reward": 0.13125632668379694, "rewards/format_reward": 0.5625000093132257, "step": 398 }, { "completion_length": 2684.3959350585938, "epoch": 0.17147461724415794, "grad_norm": 0.47744837403297424, "kl": 0.3723297119140625, "learning_rate": 2.0730776160846853e-07, "loss": 0.022, "reward": 0.21113478182815015, "reward_std": 0.7078599892556667, "rewards/cosine_scaled_reward": -0.15484928153455257, "rewards/format_reward": 0.5208333414047956, "step": 399 }, { "completion_length": 1958.1042022705078, "epoch": 0.17190437818963203, "grad_norm": 0.7218477129936218, "kl": 0.17230224609375, "learning_rate": 2.0528000059645995e-07, "loss": -0.0148, "reward": 0.5492926698643714, "reward_std": 0.6355703882873058, "rewards/cosine_scaled_reward": -0.048270344734191895, "rewards/format_reward": 0.6458333414047956, "step": 400 }, { "completion_length": 2100.812545776367, "epoch": 0.1723341391351061, "grad_norm": 0.7016561627388, "kl": 0.205230712890625, "learning_rate": 2.032690407508949e-07, "loss": 0.0212, "reward": 1.3932212237268686, "reward_std": 0.9624602943658829, "rewards/cosine_scaled_reward": 0.3424439076334238, "rewards/format_reward": 0.7083333488553762, "step": 401 }, { "completion_length": 2636.416717529297, "epoch": 0.17276390008058018, "grad_norm": 0.4849684238433838, "kl": 0.257965087890625, "learning_rate": 2.0127498008311922e-07, "loss": 0.0276, "reward": 1.0405108593404293, "reward_std": 0.6502063646912575, "rewards/cosine_scaled_reward": 0.1348387576872483, "rewards/format_reward": 0.7708333432674408, "step": 402 }, { "completion_length": 3100.5209350585938, "epoch": 0.17319366102605427, "grad_norm": 0.6343851685523987, "kl": 0.3302001953125, "learning_rate": 1.9929791578083655e-07, "loss": 0.024, "reward": 0.25967402197420597, "reward_std": 0.6368421800434589, "rewards/cosine_scaled_reward": -0.20349632622674108, "rewards/format_reward": 0.6666666809469461, "step": 403 }, { "completion_length": 2683.0000915527344, "epoch": 0.17362342197152833, "grad_norm": 0.5882278680801392, "kl": 0.27777099609375, "learning_rate": 1.9733794420337213e-07, "loss": 0.0404, "reward": 0.4491636259481311, "reward_std": 0.7464175038039684, "rewards/cosine_scaled_reward": -0.06708486285060644, "rewards/format_reward": 0.5833333544433117, "step": 404 }, { "completion_length": 2846.666748046875, "epoch": 0.17405318291700242, "grad_norm": 0.7386212944984436, "kl": 0.30413818359375, "learning_rate": 1.9539516087697517e-07, "loss": 0.0287, "reward": 0.3530942751094699, "reward_std": 0.8155166432261467, "rewards/cosine_scaled_reward": -0.14636953687295318, "rewards/format_reward": 0.6458333544433117, "step": 405 }, { "completion_length": 2506.9375610351562, "epoch": 0.1744829438624765, "grad_norm": 0.7193530201911926, "kl": 0.3016357421875, "learning_rate": 1.934696604901642e-07, "loss": 0.025, "reward": 0.7011911384761333, "reward_std": 0.7490771524608135, "rewards/cosine_scaled_reward": 0.027678880840539932, "rewards/format_reward": 0.6458333488553762, "step": 406 }, { "completion_length": 2867.9167709350586, "epoch": 0.17491270480795057, "grad_norm": 0.7153823375701904, "kl": 0.4261474609375, "learning_rate": 1.915615368891117e-07, "loss": 0.0183, "reward": 0.2872799187898636, "reward_std": 0.700722549110651, "rewards/cosine_scaled_reward": -0.14802672062069178, "rewards/format_reward": 0.5833333469927311, "step": 407 }, { "completion_length": 2940.4375915527344, "epoch": 0.17534246575342466, "grad_norm": 1.015720009803772, "kl": 0.346923828125, "learning_rate": 1.8967088307307e-07, "loss": 0.0713, "reward": 0.5467373840510845, "reward_std": 0.9398231208324432, "rewards/cosine_scaled_reward": 0.012952014803886414, "rewards/format_reward": 0.5208333469927311, "step": 408 }, { "completion_length": 2522.854202270508, "epoch": 0.17577222669889875, "grad_norm": 0.8421600461006165, "kl": 0.34344482421875, "learning_rate": 1.8779779118983867e-07, "loss": 0.0544, "reward": 0.743147999048233, "reward_std": 0.8793535307049751, "rewards/cosine_scaled_reward": 0.03824065864318982, "rewards/format_reward": 0.6666666828095913, "step": 409 }, { "completion_length": 2753.312530517578, "epoch": 0.1762019876443728, "grad_norm": 0.9058448672294617, "kl": 0.40960693359375, "learning_rate": 1.8594235253127372e-07, "loss": 0.0091, "reward": 0.0905165399890393, "reward_std": 0.5222691167145967, "rewards/cosine_scaled_reward": -0.2672417350113392, "rewards/format_reward": 0.6250000111758709, "step": 410 }, { "completion_length": 2541.20841217041, "epoch": 0.1766317485898469, "grad_norm": 1.4546343088150024, "kl": 0.371826171875, "learning_rate": 1.8410465752883758e-07, "loss": 0.0881, "reward": 0.8870152927702293, "reward_std": 0.9173106774687767, "rewards/cosine_scaled_reward": 0.0685076410882175, "rewards/format_reward": 0.7500000260770321, "step": 411 }, { "completion_length": 3228.5625915527344, "epoch": 0.177061509535321, "grad_norm": 0.5119374990463257, "kl": 0.523193359375, "learning_rate": 1.822847957491922e-07, "loss": 0.0503, "reward": 0.2971342201344669, "reward_std": 0.7240031324326992, "rewards/cosine_scaled_reward": -0.16393289528787136, "rewards/format_reward": 0.625000013038516, "step": 412 }, { "completion_length": 2615.7084197998047, "epoch": 0.17749127048079505, "grad_norm": 0.601773738861084, "kl": 0.420806884765625, "learning_rate": 1.804828558898332e-07, "loss": 0.0426, "reward": 0.5316141322255135, "reward_std": 0.8799660243093967, "rewards/cosine_scaled_reward": -0.04669295623898506, "rewards/format_reward": 0.6250000186264515, "step": 413 }, { "completion_length": 2597.2709197998047, "epoch": 0.17792103142626914, "grad_norm": 1.6752064228057861, "kl": 0.3209228515625, "learning_rate": 1.7869892577476722e-07, "loss": 0.0771, "reward": 0.7514590043574572, "reward_std": 0.991434171795845, "rewards/cosine_scaled_reward": 0.02156282402575016, "rewards/format_reward": 0.7083333469927311, "step": 414 }, { "completion_length": 2671.9584350585938, "epoch": 0.17835079237174323, "grad_norm": 0.6002082228660583, "kl": 0.3348388671875, "learning_rate": 1.7693309235023127e-07, "loss": 0.0357, "reward": 0.548585768789053, "reward_std": 0.8434926867485046, "rewards/cosine_scaled_reward": -0.09029045142233372, "rewards/format_reward": 0.729166679084301, "step": 415 }, { "completion_length": 2475.8750610351562, "epoch": 0.1787805533172173, "grad_norm": 0.45236167311668396, "kl": 0.36077880859375, "learning_rate": 1.7518544168045524e-07, "loss": 0.0311, "reward": 0.44922177493572235, "reward_std": 0.6414319835603237, "rewards/cosine_scaled_reward": -0.19205578602850437, "rewards/format_reward": 0.833333358168602, "step": 416 }, { "completion_length": 2453.791717529297, "epoch": 0.17921031426269138, "grad_norm": 0.6815130114555359, "kl": 0.3564453125, "learning_rate": 1.7345605894346726e-07, "loss": 0.0327, "reward": 0.7763444241136312, "reward_std": 0.8118596859276295, "rewards/cosine_scaled_reward": 0.02358885295689106, "rewards/format_reward": 0.7291666846722364, "step": 417 }, { "completion_length": 3026.6875762939453, "epoch": 0.17964007520816547, "grad_norm": 1.0501880645751953, "kl": 0.491363525390625, "learning_rate": 1.7174502842694212e-07, "loss": 0.0666, "reward": 0.2365866545587778, "reward_std": 0.8959635570645332, "rewards/cosine_scaled_reward": -0.17337335308548063, "rewards/format_reward": 0.5833333469927311, "step": 418 }, { "completion_length": 2515.2708892822266, "epoch": 0.18006983615363953, "grad_norm": 0.4681876599788666, "kl": 0.4251708984375, "learning_rate": 1.7005243352409333e-07, "loss": 0.0581, "reward": 0.2823067004792392, "reward_std": 0.907500758767128, "rewards/cosine_scaled_reward": -0.11926332162693143, "rewards/format_reward": 0.5208333432674408, "step": 419 }, { "completion_length": 3065.8125915527344, "epoch": 0.18049959709911362, "grad_norm": 0.9973756074905396, "kl": 0.551513671875, "learning_rate": 1.6837835672960831e-07, "loss": 0.0322, "reward": 0.49897839315235615, "reward_std": 0.8605373539030552, "rewards/cosine_scaled_reward": -0.1150941401720047, "rewards/format_reward": 0.729166679084301, "step": 420 }, { "completion_length": 2470.4583740234375, "epoch": 0.1809293580445877, "grad_norm": 1.2621263265609741, "kl": 0.4075927734375, "learning_rate": 1.6672287963562852e-07, "loss": -0.0138, "reward": 0.49474988994188607, "reward_std": 0.6678236424922943, "rewards/cosine_scaled_reward": -0.1380417225882411, "rewards/format_reward": 0.7708333469927311, "step": 421 }, { "completion_length": 2873.416748046875, "epoch": 0.18135911899006177, "grad_norm": 0.4948273301124573, "kl": 0.4202880859375, "learning_rate": 1.6508608292777203e-07, "loss": 0.0492, "reward": 0.41989211132749915, "reward_std": 0.7686353288590908, "rewards/cosine_scaled_reward": -0.14422061573714018, "rewards/format_reward": 0.7083333507180214, "step": 422 }, { "completion_length": 2611.9584045410156, "epoch": 0.18178887993553586, "grad_norm": 0.7511305809020996, "kl": 0.47357177734375, "learning_rate": 1.6346804638120098e-07, "loss": 0.0514, "reward": 0.41774701373651624, "reward_std": 0.9610500112175941, "rewards/cosine_scaled_reward": -0.1452931638341397, "rewards/format_reward": 0.7083333507180214, "step": 423 }, { "completion_length": 2800.479263305664, "epoch": 0.18221864088100995, "grad_norm": 0.9070281386375427, "kl": 0.50396728515625, "learning_rate": 1.6186884885673413e-07, "loss": 0.0303, "reward": 0.4907173467800021, "reward_std": 0.6818160191178322, "rewards/cosine_scaled_reward": -0.15047466894611716, "rewards/format_reward": 0.7916666939854622, "step": 424 }, { "completion_length": 2487.604263305664, "epoch": 0.182648401826484, "grad_norm": 0.5867276191711426, "kl": 0.352325439453125, "learning_rate": 1.6028856829700258e-07, "loss": 0.0214, "reward": 0.510941824875772, "reward_std": 0.7618425861001015, "rewards/cosine_scaled_reward": -0.10911242291331291, "rewards/format_reward": 0.7291666846722364, "step": 425 }, { "completion_length": 2803.5000610351562, "epoch": 0.1830781627719581, "grad_norm": 1.075385332107544, "kl": 0.47003173828125, "learning_rate": 1.5872728172265146e-07, "loss": 0.0117, "reward": 0.44549277424812317, "reward_std": 0.4257977083325386, "rewards/cosine_scaled_reward": -0.1730869635939598, "rewards/format_reward": 0.7916666716337204, "step": 426 }, { "completion_length": 2528.6250610351562, "epoch": 0.1835079237174322, "grad_norm": 0.4283068776130676, "kl": 0.36328125, "learning_rate": 1.5718506522858572e-07, "loss": 0.0374, "reward": 0.11541675357148051, "reward_std": 0.6809480637311935, "rewards/cosine_scaled_reward": -0.19229162158444524, "rewards/format_reward": 0.500000013038516, "step": 427 }, { "completion_length": 2662.0208740234375, "epoch": 0.18393768466290625, "grad_norm": 1.0949524641036987, "kl": 0.344970703125, "learning_rate": 1.5566199398026147e-07, "loss": -0.011, "reward": 0.5776169998571277, "reward_std": 0.637180870398879, "rewards/cosine_scaled_reward": -0.10702482610940933, "rewards/format_reward": 0.791666679084301, "step": 428 }, { "completion_length": 2786.9375610351562, "epoch": 0.18436744560838034, "grad_norm": 0.5168662071228027, "kl": 0.3770751953125, "learning_rate": 1.5415814221002265e-07, "loss": 0.0393, "reward": 0.4133287281729281, "reward_std": 0.5953468251973391, "rewards/cosine_scaled_reward": -0.11625230684876442, "rewards/format_reward": 0.6458333469927311, "step": 429 }, { "completion_length": 3129.1459197998047, "epoch": 0.18479720655385443, "grad_norm": 0.9430156946182251, "kl": 0.5804443359375, "learning_rate": 1.5267358321348285e-07, "loss": 0.0736, "reward": 0.4375925164204091, "reward_std": 0.8850552625954151, "rewards/cosine_scaled_reward": -0.10412042587995529, "rewards/format_reward": 0.6458333507180214, "step": 430 }, { "completion_length": 2824.354248046875, "epoch": 0.1852269674993285, "grad_norm": 0.8342874050140381, "kl": 0.31573486328125, "learning_rate": 1.5120838934595337e-07, "loss": -0.0013, "reward": 0.24317227769643068, "reward_std": 0.5279482360929251, "rewards/cosine_scaled_reward": -0.2325805313885212, "rewards/format_reward": 0.7083333469927311, "step": 431 }, { "completion_length": 2704.6458740234375, "epoch": 0.18565672844480258, "grad_norm": 1.462631344795227, "kl": 0.30755615234375, "learning_rate": 1.4976263201891613e-07, "loss": -0.0359, "reward": 0.1467270995490253, "reward_std": 0.7473117597401142, "rewards/cosine_scaled_reward": -0.18705312628298998, "rewards/format_reward": 0.5208333358168602, "step": 432 }, { "completion_length": 2743.5625610351562, "epoch": 0.18608648939027667, "grad_norm": 0.808102011680603, "kl": 0.3294677734375, "learning_rate": 1.483363816965435e-07, "loss": 0.052, "reward": 0.45853289403021336, "reward_std": 0.9313433095812798, "rewards/cosine_scaled_reward": -0.08323355810716748, "rewards/format_reward": 0.6250000186264515, "step": 433 }, { "completion_length": 2744.8333892822266, "epoch": 0.18651625033575073, "grad_norm": 0.8127678632736206, "kl": 0.381072998046875, "learning_rate": 1.469297078922642e-07, "loss": 0.0005, "reward": 0.4476118441671133, "reward_std": 0.5940674431622028, "rewards/cosine_scaled_reward": -0.05744407698512077, "rewards/format_reward": 0.5625000111758709, "step": 434 }, { "completion_length": 2799.7500762939453, "epoch": 0.18694601128122482, "grad_norm": 0.569139838218689, "kl": 0.42041015625, "learning_rate": 1.4554267916537495e-07, "loss": 0.0321, "reward": 0.16056723170913756, "reward_std": 0.8020668551325798, "rewards/cosine_scaled_reward": -0.200966393109411, "rewards/format_reward": 0.5625000111758709, "step": 435 }, { "completion_length": 3085.6876220703125, "epoch": 0.1873757722266989, "grad_norm": 1.8935147523880005, "kl": 0.46240234375, "learning_rate": 1.4417536311769885e-07, "loss": 0.0911, "reward": 0.5453120078891516, "reward_std": 1.067589782178402, "rewards/cosine_scaled_reward": -0.09192733559757471, "rewards/format_reward": 0.729166679084301, "step": 436 }, { "completion_length": 2758.0208740234375, "epoch": 0.18780553317217297, "grad_norm": 0.7460311055183411, "kl": 0.31573486328125, "learning_rate": 1.4282782639029128e-07, "loss": 0.0296, "reward": 0.3151918649673462, "reward_std": 1.0111035704612732, "rewards/cosine_scaled_reward": -0.15490408131154254, "rewards/format_reward": 0.6250000074505806, "step": 437 }, { "completion_length": 2952.7709197998047, "epoch": 0.18823529411764706, "grad_norm": 0.7957019209861755, "kl": 0.3729248046875, "learning_rate": 1.4150013466019114e-07, "loss": 0.053, "reward": 0.6165231950581074, "reward_std": 0.7473992072045803, "rewards/cosine_scaled_reward": -0.025071759708225727, "rewards/format_reward": 0.6666666716337204, "step": 438 }, { "completion_length": 2890.8126068115234, "epoch": 0.18866505506312115, "grad_norm": 0.7819281220436096, "kl": 0.363037109375, "learning_rate": 1.4019235263722034e-07, "loss": 0.0208, "reward": 0.7752424087375402, "reward_std": 0.8814141675829887, "rewards/cosine_scaled_reward": 0.054287852719426155, "rewards/format_reward": 0.6666666753590107, "step": 439 }, { "completion_length": 3070.2500915527344, "epoch": 0.1890948160085952, "grad_norm": 0.4043211340904236, "kl": 0.3203125, "learning_rate": 1.3890454406082956e-07, "loss": 0.0357, "reward": 0.39229028299450874, "reward_std": 0.6994556039571762, "rewards/cosine_scaled_reward": -0.13718819711357355, "rewards/format_reward": 0.6666666809469461, "step": 440 }, { "completion_length": 2974.104232788086, "epoch": 0.1895245769540693, "grad_norm": 0.5967803001403809, "kl": 0.4581298828125, "learning_rate": 1.3763677169699217e-07, "loss": 0.0483, "reward": 0.4110090620815754, "reward_std": 0.5885596871376038, "rewards/cosine_scaled_reward": -0.12782882029807752, "rewards/format_reward": 0.666666679084301, "step": 441 }, { "completion_length": 2489.7500610351562, "epoch": 0.18995433789954339, "grad_norm": 0.30350273847579956, "kl": 0.29058837890625, "learning_rate": 1.3638909733514452e-07, "loss": 0.0415, "reward": 0.28341099759563804, "reward_std": 0.6520757377147675, "rewards/cosine_scaled_reward": -0.1916278414428234, "rewards/format_reward": 0.6666666828095913, "step": 442 }, { "completion_length": 2708.041763305664, "epoch": 0.19038409884501745, "grad_norm": 0.617853581905365, "kl": 0.326416015625, "learning_rate": 1.351615817851748e-07, "loss": 0.0679, "reward": 0.8461358746280894, "reward_std": 1.0114131048321724, "rewards/cosine_scaled_reward": 0.03765125572681427, "rewards/format_reward": 0.7708333507180214, "step": 443 }, { "completion_length": 2839.1458587646484, "epoch": 0.19081385979049154, "grad_norm": 0.5250259637832642, "kl": 0.526519775390625, "learning_rate": 1.3395428487445914e-07, "loss": 0.0479, "reward": 0.39094297448173165, "reward_std": 0.7004988342523575, "rewards/cosine_scaled_reward": -0.0961951743811369, "rewards/format_reward": 0.5833333432674408, "step": 444 }, { "completion_length": 2737.166748046875, "epoch": 0.19124362073596562, "grad_norm": 0.8876476287841797, "kl": 0.3992919921875, "learning_rate": 1.3276726544494571e-07, "loss": 0.0042, "reward": 0.5218734908849001, "reward_std": 0.8609159514307976, "rewards/cosine_scaled_reward": -0.020313270390033722, "rewards/format_reward": 0.5625000111758709, "step": 445 }, { "completion_length": 3057.916748046875, "epoch": 0.1916733816814397, "grad_norm": 0.696593165397644, "kl": 0.43072509765625, "learning_rate": 1.316005813502869e-07, "loss": 0.068, "reward": 0.2893843688070774, "reward_std": 0.8802371770143509, "rewards/cosine_scaled_reward": -0.12614115793257952, "rewards/format_reward": 0.541666679084301, "step": 446 }, { "completion_length": 2757.3541870117188, "epoch": 0.19210314262691378, "grad_norm": 0.34267979860305786, "kl": 0.2486572265625, "learning_rate": 1.3045428945301953e-07, "loss": 0.0322, "reward": 0.17908588889986277, "reward_std": 0.473357368260622, "rewards/cosine_scaled_reward": -0.2750404067337513, "rewards/format_reward": 0.7291666809469461, "step": 447 }, { "completion_length": 3030.8750610351562, "epoch": 0.19253290357238786, "grad_norm": 0.5487104654312134, "kl": 0.38372802734375, "learning_rate": 1.2932844562179352e-07, "loss": 0.0403, "reward": 0.567463006824255, "reward_std": 0.7069967091083527, "rewards/cosine_scaled_reward": -0.02876850962638855, "rewards/format_reward": 0.6250000204890966, "step": 448 }, { "completion_length": 3144.6458740234375, "epoch": 0.19296266451786193, "grad_norm": 0.8899816870689392, "kl": 0.54486083984375, "learning_rate": 1.2822310472864885e-07, "loss": 0.0373, "reward": 0.3493432765826583, "reward_std": 0.6473804991692305, "rewards/cosine_scaled_reward": -0.033661700785160065, "rewards/format_reward": 0.41666667722165585, "step": 449 }, { "completion_length": 2967.6875610351562, "epoch": 0.19339242546333602, "grad_norm": 1.1761401891708374, "kl": 0.50830078125, "learning_rate": 1.2713832064634125e-07, "loss": 0.0179, "reward": 0.3656523283571005, "reward_std": 0.8386416584253311, "rewards/cosine_scaled_reward": -0.08800717256963253, "rewards/format_reward": 0.5416666772216558, "step": 450 }, { "completion_length": 3208.7709045410156, "epoch": 0.1938221864088101, "grad_norm": 0.5872260928153992, "kl": 0.493896484375, "learning_rate": 1.260741462457165e-07, "loss": 0.0401, "reward": 0.05061859940178692, "reward_std": 0.6998760774731636, "rewards/cosine_scaled_reward": -0.24552404321730137, "rewards/format_reward": 0.5416666809469461, "step": 451 }, { "completion_length": 2558.5625381469727, "epoch": 0.19425194735428417, "grad_norm": 0.3863731920719147, "kl": 0.24945068359375, "learning_rate": 1.2503063339313356e-07, "loss": 0.0065, "reward": 0.4347688741981983, "reward_std": 0.6687708422541618, "rewards/cosine_scaled_reward": -0.09511557966470718, "rewards/format_reward": 0.6250000111758709, "step": 452 }, { "completion_length": 2591.8958892822266, "epoch": 0.19468170829975825, "grad_norm": 0.45399194955825806, "kl": 0.361541748046875, "learning_rate": 1.2400783294793668e-07, "loss": 0.022, "reward": 0.41469383612275124, "reward_std": 0.7296407446265221, "rewards/cosine_scaled_reward": -0.07390309777110815, "rewards/format_reward": 0.5625000111758709, "step": 453 }, { "completion_length": 3167.229278564453, "epoch": 0.19511146924523234, "grad_norm": 0.4943077266216278, "kl": 0.33349609375, "learning_rate": 1.2300579475997657e-07, "loss": 0.0264, "reward": 0.483966744504869, "reward_std": 0.676124133169651, "rewards/cosine_scaled_reward": -0.17468329519033432, "rewards/format_reward": 0.833333358168602, "step": 454 }, { "completion_length": 3020.1459197998047, "epoch": 0.19554123019070643, "grad_norm": 0.5071336627006531, "kl": 0.39501953125, "learning_rate": 1.220245676671809e-07, "loss": 0.0303, "reward": 0.1912369169294834, "reward_std": 0.4916081055998802, "rewards/cosine_scaled_reward": -0.23771488294005394, "rewards/format_reward": 0.666666679084301, "step": 455 }, { "completion_length": 2772.1459045410156, "epoch": 0.1959709911361805, "grad_norm": 0.7353006601333618, "kl": 0.28839111328125, "learning_rate": 1.2106419949317388e-07, "loss": 0.0514, "reward": 0.1047477601096034, "reward_std": 0.71701829880476, "rewards/cosine_scaled_reward": -0.22887613414786756, "rewards/format_reward": 0.5625000111758709, "step": 456 }, { "completion_length": 2745.6875610351562, "epoch": 0.19640075208165458, "grad_norm": 0.6538786292076111, "kl": 0.2647857666015625, "learning_rate": 1.2012473704494537e-07, "loss": 0.0303, "reward": 0.4547817271668464, "reward_std": 0.9189577698707581, "rewards/cosine_scaled_reward": -0.0746924877166748, "rewards/format_reward": 0.6041666809469461, "step": 457 }, { "completion_length": 3207.604248046875, "epoch": 0.19683051302712867, "grad_norm": 0.7676445245742798, "kl": 0.39697265625, "learning_rate": 1.1920622611056974e-07, "loss": 0.0213, "reward": 0.34105163579806685, "reward_std": 0.738530807197094, "rewards/cosine_scaled_reward": -0.15239086980000138, "rewards/format_reward": 0.645833345130086, "step": 458 }, { "completion_length": 3082.5209045410156, "epoch": 0.19726027397260273, "grad_norm": 0.5118699669837952, "kl": 0.359619140625, "learning_rate": 1.1830871145697412e-07, "loss": 0.0171, "reward": 0.5560613758862019, "reward_std": 0.5608845911920071, "rewards/cosine_scaled_reward": -0.06571931671351194, "rewards/format_reward": 0.6875000242143869, "step": 459 }, { "completion_length": 2410.7083740234375, "epoch": 0.19769003491807682, "grad_norm": 0.5219330787658691, "kl": 0.1998291015625, "learning_rate": 1.1743223682775649e-07, "loss": 0.0049, "reward": 0.25343527272343636, "reward_std": 0.792074266821146, "rewards/cosine_scaled_reward": -0.06078235059976578, "rewards/format_reward": 0.3750000037252903, "step": 460 }, { "completion_length": 2907.4584350585938, "epoch": 0.1981197958635509, "grad_norm": 2.0530433654785156, "kl": 0.2987060546875, "learning_rate": 1.1657684494105386e-07, "loss": 0.111, "reward": 0.7132563479244709, "reward_std": 1.004899576306343, "rewards/cosine_scaled_reward": 0.02329482464119792, "rewards/format_reward": 0.6666666846722364, "step": 461 }, { "completion_length": 2502.916763305664, "epoch": 0.19854955680902497, "grad_norm": 0.591495156288147, "kl": 0.31134033203125, "learning_rate": 1.1574257748745986e-07, "loss": 0.0419, "reward": 0.4347365270368755, "reward_std": 0.7134026139974594, "rewards/cosine_scaled_reward": -0.13679842837154865, "rewards/format_reward": 0.7083333507180214, "step": 462 }, { "completion_length": 2764.8125610351562, "epoch": 0.19897931775449906, "grad_norm": 0.28817346692085266, "kl": 0.225830078125, "learning_rate": 1.1492947512799328e-07, "loss": 0.0176, "reward": 0.4127920554019511, "reward_std": 0.6573353633284569, "rewards/cosine_scaled_reward": -0.13735398277640343, "rewards/format_reward": 0.6875000037252903, "step": 463 }, { "completion_length": 2837.2500610351562, "epoch": 0.19940907869997315, "grad_norm": 0.5355004072189331, "kl": 0.3555908203125, "learning_rate": 1.1413757749211602e-07, "loss": 0.0332, "reward": 0.27995091397315264, "reward_std": 0.7207585573196411, "rewards/cosine_scaled_reward": -0.2350245516281575, "rewards/format_reward": 0.7500000223517418, "step": 464 }, { "completion_length": 2835.541732788086, "epoch": 0.1998388396454472, "grad_norm": 0.5600281953811646, "kl": 0.372802734375, "learning_rate": 1.1336692317580158e-07, "loss": 0.0216, "reward": 0.5202487520873547, "reward_std": 0.6662468574941158, "rewards/cosine_scaled_reward": -0.11487563140690327, "rewards/format_reward": 0.750000013038516, "step": 465 }, { "completion_length": 2969.541748046875, "epoch": 0.2002686005909213, "grad_norm": 0.3942917287349701, "kl": 0.3817138671875, "learning_rate": 1.1261754973965422e-07, "loss": 0.0325, "reward": 0.4225296713411808, "reward_std": 0.5472663380205631, "rewards/cosine_scaled_reward": -0.19498517335159704, "rewards/format_reward": 0.8125000204890966, "step": 466 }, { "completion_length": 2717.3958740234375, "epoch": 0.2006983615363954, "grad_norm": 0.7533707618713379, "kl": 0.2984619140625, "learning_rate": 1.1188949370707787e-07, "loss": -0.0244, "reward": 0.6556289289146662, "reward_std": 0.8273407556116581, "rewards/cosine_scaled_reward": 0.004897790960967541, "rewards/format_reward": 0.6458333488553762, "step": 467 }, { "completion_length": 2835.166717529297, "epoch": 0.20112812248186945, "grad_norm": 0.7149420380592346, "kl": 0.2392578125, "learning_rate": 1.1118279056249653e-07, "loss": 0.0488, "reward": 0.3270202912390232, "reward_std": 0.6147549040615559, "rewards/cosine_scaled_reward": -0.13857318460941315, "rewards/format_reward": 0.6041666902601719, "step": 468 }, { "completion_length": 2521.791748046875, "epoch": 0.20155788342734354, "grad_norm": 1.3148115873336792, "kl": 0.19573974609375, "learning_rate": 1.1049747474962444e-07, "loss": 0.019, "reward": 0.8161277137696743, "reward_std": 0.8777006380259991, "rewards/cosine_scaled_reward": 0.04348049499094486, "rewards/format_reward": 0.7291666865348816, "step": 469 }, { "completion_length": 2761.5625610351562, "epoch": 0.20198764437281763, "grad_norm": 1.9801470041275024, "kl": 0.273193359375, "learning_rate": 1.0983357966978745e-07, "loss": 0.1024, "reward": 0.637454379349947, "reward_std": 0.9289520531892776, "rewards/cosine_scaled_reward": -0.056272827088832855, "rewards/format_reward": 0.7500000223517418, "step": 470 }, { "completion_length": 2782.8333740234375, "epoch": 0.2024174053182917, "grad_norm": 0.6316497921943665, "kl": 0.306793212890625, "learning_rate": 1.0919113768029517e-07, "loss": 0.0249, "reward": 0.4109913669526577, "reward_std": 0.7808561101555824, "rewards/cosine_scaled_reward": -0.08617100026458502, "rewards/format_reward": 0.5833333544433117, "step": 471 }, { "completion_length": 2884.2708740234375, "epoch": 0.20284716626376578, "grad_norm": 0.8182538151741028, "kl": 0.25299072265625, "learning_rate": 1.0857018009286381e-07, "loss": -0.0051, "reward": 0.5310811214148998, "reward_std": 0.6887518055737019, "rewards/cosine_scaled_reward": -0.02612612582743168, "rewards/format_reward": 0.5833333488553762, "step": 472 }, { "completion_length": 2920.1875915527344, "epoch": 0.20327692720923987, "grad_norm": 0.7002537250518799, "kl": 0.3231201171875, "learning_rate": 1.0797073717209013e-07, "loss": 0.0055, "reward": 0.17460571718402207, "reward_std": 0.6445187889039516, "rewards/cosine_scaled_reward": -0.23561381362378597, "rewards/format_reward": 0.6458333376795053, "step": 473 }, { "completion_length": 3216.729217529297, "epoch": 0.20370668815471393, "grad_norm": 0.8297850489616394, "kl": 0.4779052734375, "learning_rate": 1.0739283813397639e-07, "loss": 0.0449, "reward": 0.24907919927500188, "reward_std": 0.7195624262094498, "rewards/cosine_scaled_reward": -0.18796042446047068, "rewards/format_reward": 0.6250000204890966, "step": 474 }, { "completion_length": 2985.291778564453, "epoch": 0.20413644910018802, "grad_norm": 0.5859295129776001, "kl": 0.35595703125, "learning_rate": 1.068365111445064e-07, "loss": 0.041, "reward": 0.5817429684102535, "reward_std": 0.7679522298276424, "rewards/cosine_scaled_reward": -0.10496185859665275, "rewards/format_reward": 0.7916667014360428, "step": 475 }, { "completion_length": 2901.0209045410156, "epoch": 0.2045662100456621, "grad_norm": 1.0222387313842773, "kl": 0.2723388671875, "learning_rate": 1.063017833182728e-07, "loss": 0.0542, "reward": 0.2769742552191019, "reward_std": 0.7374824993312359, "rewards/cosine_scaled_reward": -0.11151288542896509, "rewards/format_reward": 0.5000000149011612, "step": 476 }, { "completion_length": 2316.8959045410156, "epoch": 0.20499597099113617, "grad_norm": 0.33235031366348267, "kl": 0.219635009765625, "learning_rate": 1.0578868071715544e-07, "loss": 0.0305, "reward": 1.0450308350846171, "reward_std": 0.5956838317215443, "rewards/cosine_scaled_reward": 0.09543208277318627, "rewards/format_reward": 0.8541666716337204, "step": 477 }, { "completion_length": 2517.0625610351562, "epoch": 0.20542573193661026, "grad_norm": 0.5203970074653625, "kl": 0.189727783203125, "learning_rate": 1.0529722834905125e-07, "loss": -0.0209, "reward": 1.0200487449765205, "reward_std": 0.7888946458697319, "rewards/cosine_scaled_reward": 0.10377435479313135, "rewards/format_reward": 0.8125000149011612, "step": 478 }, { "completion_length": 3118.104278564453, "epoch": 0.20585549288208435, "grad_norm": 1.03202486038208, "kl": 0.2763671875, "learning_rate": 1.0482745016665526e-07, "loss": 0.0043, "reward": 0.22286950796842575, "reward_std": 0.5560217425227165, "rewards/cosine_scaled_reward": -0.23231525160372257, "rewards/format_reward": 0.6875000055879354, "step": 479 }, { "completion_length": 3230.666748046875, "epoch": 0.2062852538275584, "grad_norm": 0.5927737355232239, "kl": 0.329833984375, "learning_rate": 1.0437936906629334e-07, "loss": 0.0341, "reward": 0.4861418139189482, "reward_std": 0.7718168199062347, "rewards/cosine_scaled_reward": -0.06942910025827587, "rewards/format_reward": 0.6250000055879354, "step": 480 }, { "completion_length": 3149.041717529297, "epoch": 0.2067150147730325, "grad_norm": 0.36365607380867004, "kl": 0.405029296875, "learning_rate": 1.0395300688680625e-07, "loss": 0.0384, "reward": -0.004494348540902138, "reward_std": 0.5797578878700733, "rewards/cosine_scaled_reward": -0.2834971733391285, "rewards/format_reward": 0.562500013038516, "step": 481 }, { "completion_length": 3033.2084350585938, "epoch": 0.2071447757185066, "grad_norm": 0.5469386577606201, "kl": 0.2989501953125, "learning_rate": 1.0354838440848501e-07, "loss": 0.0385, "reward": 0.2577262003906071, "reward_std": 0.8128115795552731, "rewards/cosine_scaled_reward": -0.20447024749591947, "rewards/format_reward": 0.6666666865348816, "step": 482 }, { "completion_length": 2760.291748046875, "epoch": 0.20757453666398065, "grad_norm": 0.642526388168335, "kl": 0.28515625, "learning_rate": 1.0316552135205837e-07, "loss": -0.0072, "reward": 0.29480692837387323, "reward_std": 0.6898843199014664, "rewards/cosine_scaled_reward": -0.1442632209509611, "rewards/format_reward": 0.583333345130086, "step": 483 }, { "completion_length": 2724.0625610351562, "epoch": 0.20800429760945474, "grad_norm": 0.7697243094444275, "kl": 0.244384765625, "learning_rate": 1.0280443637773163e-07, "loss": 0.0363, "reward": 1.0742819318547845, "reward_std": 0.6771000511944294, "rewards/cosine_scaled_reward": 0.14130763779394329, "rewards/format_reward": 0.7916666865348816, "step": 484 }, { "completion_length": 2509.166717529297, "epoch": 0.20843405855492883, "grad_norm": 1.162597894668579, "kl": 0.20245361328125, "learning_rate": 1.0246514708427701e-07, "loss": 0.0382, "reward": 1.1048640441149473, "reward_std": 0.7781410440802574, "rewards/cosine_scaled_reward": 0.1774320276454091, "rewards/format_reward": 0.7500000149011612, "step": 485 }, { "completion_length": 2437.6250610351562, "epoch": 0.2088638195004029, "grad_norm": 0.46200332045555115, "kl": 0.229705810546875, "learning_rate": 1.0214767000817596e-07, "loss": 0.0409, "reward": 1.0630637668073177, "reward_std": 0.6511485353112221, "rewards/cosine_scaled_reward": 0.14611523412168026, "rewards/format_reward": 0.7708333414047956, "step": 486 }, { "completion_length": 3081.8334045410156, "epoch": 0.20929358044587698, "grad_norm": 0.8947097659111023, "kl": 0.3155517578125, "learning_rate": 1.0185202062281336e-07, "loss": 0.0654, "reward": 0.8693859986960888, "reward_std": 0.7330617038533092, "rewards/cosine_scaled_reward": 0.05969297050614841, "rewards/format_reward": 0.7500000093132257, "step": 487 }, { "completion_length": 2524.3750534057617, "epoch": 0.20972334139135107, "grad_norm": 0.9244005084037781, "kl": 0.269134521484375, "learning_rate": 1.0157821333772304e-07, "loss": -0.0315, "reward": 0.25587264308705926, "reward_std": 0.8209496177732944, "rewards/cosine_scaled_reward": -0.1845636833459139, "rewards/format_reward": 0.6250000149011612, "step": 488 }, { "completion_length": 2935.354217529297, "epoch": 0.21015310233682513, "grad_norm": 0.6782417893409729, "kl": 0.335693359375, "learning_rate": 1.013262614978859e-07, "loss": -0.0131, "reward": 0.36598102655261755, "reward_std": 0.85771818831563, "rewards/cosine_scaled_reward": -0.10867615917231888, "rewards/format_reward": 0.583333345130086, "step": 489 }, { "completion_length": 2876.8125534057617, "epoch": 0.21058286328229922, "grad_norm": 0.6608256697654724, "kl": 0.3896484375, "learning_rate": 1.0109617738307911e-07, "loss": 0.0461, "reward": 0.07418560422956944, "reward_std": 0.7696478255093098, "rewards/cosine_scaled_reward": -0.19207386672496796, "rewards/format_reward": 0.4583333395421505, "step": 490 }, { "completion_length": 2745.7084045410156, "epoch": 0.2110126242277733, "grad_norm": 0.5124351978302002, "kl": 0.2747955322265625, "learning_rate": 1.0088797220727779e-07, "loss": 0.0311, "reward": 0.42908764933235943, "reward_std": 0.7404365316033363, "rewards/cosine_scaled_reward": -0.1500395181355998, "rewards/format_reward": 0.7291666809469461, "step": 491 }, { "completion_length": 2620.354217529297, "epoch": 0.21144238517324737, "grad_norm": 0.5376548171043396, "kl": 0.4345703125, "learning_rate": 1.0070165611810855e-07, "loss": 0.0497, "reward": 0.38850695826113224, "reward_std": 0.6497341170907021, "rewards/cosine_scaled_reward": -0.1807465385645628, "rewards/format_reward": 0.7500000204890966, "step": 492 }, { "completion_length": 2906.8541870117188, "epoch": 0.21187214611872146, "grad_norm": 0.5724326372146606, "kl": 0.35693359375, "learning_rate": 1.005372381963547e-07, "loss": 0.0347, "reward": 0.33506373316049576, "reward_std": 0.5903512164950371, "rewards/cosine_scaled_reward": -0.13455147296190262, "rewards/format_reward": 0.6041666753590107, "step": 493 }, { "completion_length": 2510.1875762939453, "epoch": 0.21230190706419555, "grad_norm": 0.6593925356864929, "kl": 0.224517822265625, "learning_rate": 1.0039472645551372e-07, "loss": 0.0267, "reward": 0.7605352476239204, "reward_std": 0.7976373545825481, "rewards/cosine_scaled_reward": -0.015565723762847483, "rewards/format_reward": 0.7916666865348816, "step": 494 }, { "completion_length": 2881.7083892822266, "epoch": 0.2127316680096696, "grad_norm": 0.4908088445663452, "kl": 0.2696533203125, "learning_rate": 1.002741278414069e-07, "loss": 0.0467, "reward": 0.47833092603832483, "reward_std": 0.6104793511331081, "rewards/cosine_scaled_reward": -0.14625119976699352, "rewards/format_reward": 0.7708333563059568, "step": 495 }, { "completion_length": 2785.416748046875, "epoch": 0.2131614289551437, "grad_norm": 0.32307371497154236, "kl": 0.26568603515625, "learning_rate": 1.0017544823184055e-07, "loss": 0.0321, "reward": 0.3339098338037729, "reward_std": 0.6954307444393635, "rewards/cosine_scaled_reward": -0.14554508170112967, "rewards/format_reward": 0.6250000186264515, "step": 496 }, { "completion_length": 2617.1459045410156, "epoch": 0.2135911899006178, "grad_norm": 0.43503618240356445, "kl": 0.2720947265625, "learning_rate": 1.0009869243631952e-07, "loss": 0.0276, "reward": 0.39231533324345946, "reward_std": 0.6116465739905834, "rewards/cosine_scaled_reward": -0.12675900547765195, "rewards/format_reward": 0.6458333507180214, "step": 497 }, { "completion_length": 2606.979248046875, "epoch": 0.21402095084609185, "grad_norm": 0.5417603254318237, "kl": 0.254974365234375, "learning_rate": 1.000438641958131e-07, "loss": 0.0061, "reward": 0.8166847922839224, "reward_std": 0.6433618552982807, "rewards/cosine_scaled_reward": 0.03334238799288869, "rewards/format_reward": 0.7500000111758709, "step": 498 }, { "completion_length": 2713.229248046875, "epoch": 0.21445071179156594, "grad_norm": 0.9731401801109314, "kl": 0.348175048828125, "learning_rate": 1.0001096618257236e-07, "loss": 0.0486, "reward": 1.1139494348317385, "reward_std": 0.982919704169035, "rewards/cosine_scaled_reward": 0.19239136576652527, "rewards/format_reward": 0.7291666865348816, "step": 499 }, { "completion_length": 3009.666732788086, "epoch": 0.21488047273704003, "grad_norm": 0.5509792566299438, "kl": 0.2877197265625, "learning_rate": 1e-07, "loss": -0.019, "reward": 0.5718871653079987, "reward_std": 0.8679844252765179, "rewards/cosine_scaled_reward": -0.047389762476086617, "rewards/format_reward": 0.6666666865348816, "step": 500 }, { "epoch": 0.21488047273704003, "step": 500, "total_flos": 0.0, "train_loss": 0.03135233913107368, "train_runtime": 77509.0926, "train_samples_per_second": 0.31, "train_steps_per_second": 0.006 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }