{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.2727272727272725, "eval_steps": 500, "global_step": 650, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 145.875, "epoch": 0.0034965034965034965, "grad_norm": 8.602514266967773, "kl": 0.0, "learning_rate": 5.0000000000000004e-08, "loss": 0.0, "reward": 0.7051772475242615, "reward_std": 0.8263505697250366, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.3301772475242615, "rewards/reward_search_strategy": 0.0, "step": 1 }, { "completion_length": 88.125, "epoch": 0.006993006993006993, "grad_norm": 1.4985997676849365, "kl": 0.0, "learning_rate": 1.0000000000000001e-07, "loss": -0.0, "reward": 1.8388628959655762, "reward_std": 2.5329272747039795, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.2388630509376526, "rewards/reward_search_strategy": 0.10000000149011612, "step": 2 }, { "completion_length": 296.5, "epoch": 0.01048951048951049, "grad_norm": 0.7674520611763, "kl": 0.0006381775019690394, "learning_rate": 1.5000000000000002e-07, "loss": 0.0, "reward": 1.654070258140564, "reward_std": 2.827007532119751, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.17907029390335083, "rewards/reward_search_strategy": 0.22499999403953552, "step": 3 }, { "completion_length": 290.125, "epoch": 0.013986013986013986, "grad_norm": 1.0457236766815186, "kl": 0.0007995082996785641, "learning_rate": 2.0000000000000002e-07, "loss": 0.0, "reward": 3.421433448791504, "reward_std": 2.9866631031036377, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.396433562040329, "rewards/reward_search_strategy": 0.2750000059604645, "step": 4 }, { "completion_length": 259.125, "epoch": 0.017482517482517484, "grad_norm": 1.3121755123138428, "kl": 0.000728036102373153, "learning_rate": 2.5000000000000004e-07, "loss": 0.0, "reward": 0.9268761873245239, "reward_std": 0.7560767531394958, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.2268761545419693, "rewards/reward_search_strategy": 0.07500000298023224, "step": 5 }, { "completion_length": 383.5, "epoch": 0.02097902097902098, "grad_norm": 0.9658453464508057, "kl": 0.0007470359560102224, "learning_rate": 3.0000000000000004e-07, "loss": 0.0, "reward": 1.4594597816467285, "reward_std": 2.2520837783813477, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.25945988297462463, "rewards/reward_search_strategy": 0.07500000298023224, "step": 6 }, { "completion_length": 144.375, "epoch": 0.024475524475524476, "grad_norm": 1.1642646789550781, "kl": 0.0009361266857013106, "learning_rate": 3.5000000000000004e-07, "loss": 0.0, "reward": 1.378964900970459, "reward_std": 2.0963218212127686, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.30396491289138794, "rewards/reward_search_strategy": 0.07500000298023224, "step": 7 }, { "completion_length": 314.375, "epoch": 0.027972027972027972, "grad_norm": 5.21930456161499, "kl": 0.0008346753311343491, "learning_rate": 4.0000000000000003e-07, "loss": 0.0, "reward": 1.181166410446167, "reward_std": 1.6016024351119995, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.3561664819717407, "rewards/reward_search_strategy": 0.07500000298023224, "step": 8 }, { "completion_length": 98.375, "epoch": 0.03146853146853147, "grad_norm": 1.6168773174285889, "kl": 0.0009058149298653007, "learning_rate": 4.5000000000000003e-07, "loss": 0.0, "reward": 1.141206979751587, "reward_std": 2.1089890003204346, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.16620691120624542, "rewards/reward_search_strategy": 0.10000000149011612, "step": 9 }, { "completion_length": 269.625, "epoch": 0.03496503496503497, "grad_norm": 1.310072660446167, "kl": 0.000750248203985393, "learning_rate": 5.000000000000001e-07, "loss": 0.0, "reward": 1.901651382446289, "reward_std": 3.08128023147583, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.27665144205093384, "rewards/reward_search_strategy": 0.125, "step": 10 }, { "completion_length": 311.125, "epoch": 0.038461538461538464, "grad_norm": 1.1051065921783447, "kl": 0.000834438658785075, "learning_rate": 5.5e-07, "loss": 0.0, "reward": 2.1270594596862793, "reward_std": 2.309929132461548, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.377059668302536, "rewards/reward_search_strategy": 0.125, "step": 11 }, { "completion_length": 182.375, "epoch": 0.04195804195804196, "grad_norm": 1.3039528131484985, "kl": 0.0007587745785713196, "learning_rate": 6.000000000000001e-07, "loss": 0.0, "reward": 0.24762853980064392, "reward_std": 0.28513866662979126, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.17262855172157288, "rewards/reward_search_strategy": 0.07500000298023224, "step": 12 }, { "completion_length": 198.5, "epoch": 0.045454545454545456, "grad_norm": 1.176461935043335, "kl": 0.0008510244661010802, "learning_rate": 6.5e-07, "loss": 0.0, "reward": 2.0060770511627197, "reward_std": 2.2957346439361572, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.3060770034790039, "rewards/reward_search_strategy": 0.07500000298023224, "step": 13 }, { "completion_length": 158.25, "epoch": 0.04895104895104895, "grad_norm": 1.1484898328781128, "kl": 0.0009028888889588416, "learning_rate": 7.000000000000001e-07, "loss": 0.0, "reward": 0.2598831355571747, "reward_std": 0.4156216084957123, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.1348831057548523, "rewards/reward_search_strategy": 0.0, "step": 14 }, { "completion_length": 321.125, "epoch": 0.05244755244755245, "grad_norm": 1.08210027217865, "kl": 0.0007499511120840907, "learning_rate": 7.5e-07, "loss": 0.0, "reward": 2.711674213409424, "reward_std": 3.2537524700164795, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.3866744041442871, "rewards/reward_search_strategy": 0.20000000298023224, "step": 15 }, { "completion_length": 176.0, "epoch": 0.055944055944055944, "grad_norm": 1.007460117340088, "kl": 0.0008281145128421485, "learning_rate": 8.000000000000001e-07, "loss": 0.0, "reward": 1.6642776727676392, "reward_std": 3.1836605072021484, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.1892777681350708, "rewards/reward_search_strategy": 0.10000000149011612, "step": 16 }, { "completion_length": 299.0, "epoch": 0.05944055944055944, "grad_norm": 0.9308914542198181, "kl": 0.0006533896084874868, "learning_rate": 8.500000000000001e-07, "loss": 0.0, "reward": 2.9600870609283447, "reward_std": 3.002318859100342, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.38508710265159607, "rewards/reward_search_strategy": 0.07500000298023224, "step": 17 }, { "completion_length": 170.125, "epoch": 0.06293706293706294, "grad_norm": 1.6546720266342163, "kl": 0.0009436010150238872, "learning_rate": 9.000000000000001e-07, "loss": 0.0, "reward": 1.807711124420166, "reward_std": 2.873779773712158, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.33271118998527527, "rewards/reward_search_strategy": 0.22499999403953552, "step": 18 }, { "completion_length": 377.125, "epoch": 0.06643356643356643, "grad_norm": 0.7977221608161926, "kl": 0.0008231342071667314, "learning_rate": 9.500000000000001e-07, "loss": 0.0, "reward": 0.7038719058036804, "reward_std": 1.2563936710357666, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.22887186706066132, "rewards/reward_search_strategy": 0.10000000149011612, "step": 19 }, { "completion_length": 161.625, "epoch": 0.06993006993006994, "grad_norm": 1.5974496603012085, "kl": 0.000790413178037852, "learning_rate": 1.0000000000000002e-06, "loss": 0.0, "reward": 2.404862880706787, "reward_std": 2.712550163269043, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.179862841963768, "rewards/reward_search_strategy": 0.10000000149011612, "step": 20 }, { "completion_length": 298.875, "epoch": 0.07342657342657342, "grad_norm": 0.8817616105079651, "kl": 0.0007674504304304719, "learning_rate": 1.0500000000000001e-06, "loss": 0.0, "reward": 2.6999998092651367, "reward_std": 3.122270345687866, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.05000000074505806, "rewards/reward_search_strategy": 0.2750000059604645, "step": 21 }, { "completion_length": 150.125, "epoch": 0.07692307692307693, "grad_norm": 1.345902919769287, "kl": 0.0010101046646013856, "learning_rate": 1.1e-06, "loss": 0.0, "reward": 1.2625212669372559, "reward_std": 1.9802589416503906, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.13752126693725586, "rewards/reward_search_strategy": 0.0, "step": 22 }, { "completion_length": 156.625, "epoch": 0.08041958041958042, "grad_norm": 3.1841650009155273, "kl": 0.0009282200480811298, "learning_rate": 1.1500000000000002e-06, "loss": 0.0, "reward": 2.532832145690918, "reward_std": 3.623222827911377, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.18283231556415558, "rewards/reward_search_strategy": 0.22500000894069672, "step": 23 }, { "completion_length": 230.625, "epoch": 0.08391608391608392, "grad_norm": 0.560590922832489, "kl": 0.0008200581069104373, "learning_rate": 1.2000000000000002e-06, "loss": 0.0, "reward": 0.7196725606918335, "reward_std": 1.5131947994232178, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.1946725845336914, "rewards/reward_search_strategy": 0.02500000037252903, "step": 24 }, { "completion_length": 232.5, "epoch": 0.08741258741258741, "grad_norm": 5.627699375152588, "kl": 0.0007359281880781054, "learning_rate": 1.25e-06, "loss": 0.0, "reward": 2.3161444664001465, "reward_std": 2.266465425491333, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.31614452600479126, "rewards/reward_search_strategy": 0.0, "step": 25 }, { "completion_length": 256.625, "epoch": 0.09090909090909091, "grad_norm": 1.243622064590454, "kl": 0.0009776452789083123, "learning_rate": 1.3e-06, "loss": 0.0, "reward": 1.8542793989181519, "reward_std": 2.3880927562713623, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.22927939891815186, "rewards/reward_search_strategy": 0.0, "step": 26 }, { "completion_length": 221.625, "epoch": 0.0944055944055944, "grad_norm": 2.455430030822754, "kl": 0.0014260835014283657, "learning_rate": 1.3500000000000002e-06, "loss": 0.0001, "reward": 1.0012282133102417, "reward_std": 1.7000349760055542, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.15122826397418976, "rewards/reward_search_strategy": 0.10000000149011612, "step": 27 }, { "completion_length": 185.75, "epoch": 0.0979020979020979, "grad_norm": 1.199497938156128, "kl": 0.0007826727814972401, "learning_rate": 1.4000000000000001e-06, "loss": 0.0, "reward": 2.173332691192627, "reward_std": 2.801347255706787, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.32333269715309143, "rewards/reward_search_strategy": 0.10000000149011612, "step": 28 }, { "completion_length": 243.125, "epoch": 0.10139860139860139, "grad_norm": 0.8430306315422058, "kl": 0.0008962135761976242, "learning_rate": 1.45e-06, "loss": 0.0, "reward": 1.0625323057174683, "reward_std": 1.898645043373108, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.18753226101398468, "rewards/reward_search_strategy": 0.0, "step": 29 }, { "completion_length": 334.875, "epoch": 0.1048951048951049, "grad_norm": 0.8412113785743713, "kl": 0.001170428702607751, "learning_rate": 1.5e-06, "loss": 0.0, "reward": 1.303612232208252, "reward_std": 2.1826775074005127, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.27861225605010986, "rewards/reward_search_strategy": 0.02500000037252903, "step": 30 }, { "completion_length": 165.75, "epoch": 0.10839160839160839, "grad_norm": 1.9105679988861084, "kl": 0.0031302073039114475, "learning_rate": 1.5500000000000002e-06, "loss": 0.0001, "reward": 1.0198408365249634, "reward_std": 2.1316380500793457, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.16984085738658905, "rewards/reward_search_strategy": 0.10000000149011612, "step": 31 }, { "completion_length": 174.625, "epoch": 0.11188811188811189, "grad_norm": 2.309382915496826, "kl": 0.0032356895972043276, "learning_rate": 1.6000000000000001e-06, "loss": 0.0001, "reward": 0.9216341376304626, "reward_std": 1.8671420812606812, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.14663413166999817, "rewards/reward_search_strategy": 0.02500000037252903, "step": 32 }, { "completion_length": 85.75, "epoch": 0.11538461538461539, "grad_norm": 1.4122040271759033, "kl": 0.0017049856251105666, "learning_rate": 1.6500000000000003e-06, "loss": 0.0001, "reward": 1.524999976158142, "reward_std": 2.2926902770996094, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.10000000149011612, "rewards/reward_search_strategy": 0.05000000074505806, "step": 33 }, { "completion_length": 173.5, "epoch": 0.11888111888111888, "grad_norm": 0.9712541699409485, "kl": 0.0014842856908217072, "learning_rate": 1.7000000000000002e-06, "loss": 0.0001, "reward": 3.4844837188720703, "reward_std": 3.1145381927490234, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.3094840347766876, "rewards/reward_search_strategy": 0.17499999701976776, "step": 34 }, { "completion_length": 268.25, "epoch": 0.12237762237762238, "grad_norm": 1.1684075593948364, "kl": 0.0012478481512516737, "learning_rate": 1.75e-06, "loss": 0.0, "reward": 2.8716678619384766, "reward_std": 2.387646198272705, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.3716679811477661, "rewards/reward_search_strategy": 0.125, "step": 35 }, { "completion_length": 235.125, "epoch": 0.1258741258741259, "grad_norm": 1.2749513387680054, "kl": 0.004472827073186636, "learning_rate": 1.8000000000000001e-06, "loss": 0.0002, "reward": 1.8775699138641357, "reward_std": 2.413510322570801, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.30256983637809753, "rewards/reward_search_strategy": 0.07500000298023224, "step": 36 }, { "completion_length": 175.5, "epoch": 0.12937062937062938, "grad_norm": 0.8929537534713745, "kl": 0.0037049425300210714, "learning_rate": 1.85e-06, "loss": 0.0001, "reward": 0.6821428537368774, "reward_std": 1.4377206563949585, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.18214285373687744, "rewards/reward_search_strategy": 0.0, "step": 37 }, { "completion_length": 290.125, "epoch": 0.13286713286713286, "grad_norm": 2.52665114402771, "kl": 0.005157872103154659, "learning_rate": 1.9000000000000002e-06, "loss": 0.0002, "reward": 2.574741840362549, "reward_std": 2.5146002769470215, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.14974190294742584, "rewards/reward_search_strategy": 0.17500001192092896, "step": 38 }, { "completion_length": 344.125, "epoch": 0.13636363636363635, "grad_norm": 0.8259175419807434, "kl": 0.002582128159701824, "learning_rate": 1.9500000000000004e-06, "loss": 0.0001, "reward": 2.376612901687622, "reward_std": 2.9910624027252197, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.4266127943992615, "rewards/reward_search_strategy": 0.07500000298023224, "step": 39 }, { "completion_length": 243.125, "epoch": 0.13986013986013987, "grad_norm": 1.0021271705627441, "kl": 0.004831339232623577, "learning_rate": 2.0000000000000003e-06, "loss": 0.0002, "reward": 2.04349946975708, "reward_std": 2.3903656005859375, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.34349945187568665, "rewards/reward_search_strategy": 0.07500000298023224, "step": 40 }, { "completion_length": 207.375, "epoch": 0.14335664335664336, "grad_norm": 1.2446260452270508, "kl": 0.005710378754884005, "learning_rate": 2.05e-06, "loss": 0.0002, "reward": 2.8324475288391113, "reward_std": 3.167931079864502, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.25744765996932983, "rewards/reward_search_strategy": 0.20000000298023224, "step": 41 }, { "completion_length": 441.125, "epoch": 0.14685314685314685, "grad_norm": 0.8263920545578003, "kl": 0.0030913222581148148, "learning_rate": 2.1000000000000002e-06, "loss": 0.0001, "reward": 2.5794529914855957, "reward_std": 2.6344144344329834, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.30445292592048645, "rewards/reward_search_strategy": 0.15000000596046448, "step": 42 }, { "completion_length": 226.375, "epoch": 0.15034965034965034, "grad_norm": 1.141964316368103, "kl": 0.008909309282898903, "learning_rate": 2.15e-06, "loss": 0.0004, "reward": 1.8603672981262207, "reward_std": 2.203197479248047, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.21036729216575623, "rewards/reward_search_strategy": 0.02500000037252903, "step": 43 }, { "completion_length": 397.75, "epoch": 0.15384615384615385, "grad_norm": 0.9564006328582764, "kl": 0.01092858798801899, "learning_rate": 2.2e-06, "loss": 0.0004, "reward": 2.557056188583374, "reward_std": 2.429002046585083, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.28205621242523193, "rewards/reward_search_strategy": 0.02500000037252903, "step": 44 }, { "completion_length": 307.25, "epoch": 0.15734265734265734, "grad_norm": 1.0510841608047485, "kl": 0.007918575778603554, "learning_rate": 2.25e-06, "loss": 0.0003, "reward": 1.2331278324127197, "reward_std": 1.7548075914382935, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.15812794864177704, "rewards/reward_search_strategy": 0.07500000298023224, "step": 45 }, { "completion_length": 277.0, "epoch": 0.16083916083916083, "grad_norm": 0.9169591665267944, "kl": 0.015062836930155754, "learning_rate": 2.3000000000000004e-06, "loss": 0.0006, "reward": 1.2089338302612305, "reward_std": 2.5115318298339844, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.1339338719844818, "rewards/reward_search_strategy": 0.20000000298023224, "step": 46 }, { "completion_length": 206.25, "epoch": 0.16433566433566432, "grad_norm": 1.3427180051803589, "kl": 0.019624339416623116, "learning_rate": 2.35e-06, "loss": 0.0008, "reward": 4.1133527755737305, "reward_std": 3.2583820819854736, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.36335283517837524, "rewards/reward_search_strategy": 0.125, "step": 47 }, { "completion_length": 252.125, "epoch": 0.16783216783216784, "grad_norm": 0.7444542050361633, "kl": 0.021463895216584206, "learning_rate": 2.4000000000000003e-06, "loss": 0.0009, "reward": 1.631263017654419, "reward_std": 2.5925517082214355, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.23126289248466492, "rewards/reward_search_strategy": 0.02500000037252903, "step": 48 }, { "completion_length": 117.625, "epoch": 0.17132867132867133, "grad_norm": 1.888673186302185, "kl": 0.03609447553753853, "learning_rate": 2.4500000000000003e-06, "loss": 0.0014, "reward": 1.9036250114440918, "reward_std": 2.2273151874542236, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.20362500846385956, "rewards/reward_search_strategy": 0.07500000298023224, "step": 49 }, { "completion_length": 302.75, "epoch": 0.17482517482517482, "grad_norm": 1.2723031044006348, "kl": 0.2012804001569748, "learning_rate": 2.5e-06, "loss": 0.0081, "reward": 0.15956448018550873, "reward_std": 0.4513165056705475, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.03456447646021843, "rewards/reward_search_strategy": 0.0, "step": 50 }, { "completion_length": 222.0, "epoch": 0.17832167832167833, "grad_norm": 0.8149501085281372, "kl": 0.04784020781517029, "learning_rate": 2.55e-06, "loss": 0.0019, "reward": 0.49046826362609863, "reward_std": 0.9328532814979553, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.1654682457447052, "rewards/reward_search_strategy": 0.07500000298023224, "step": 51 }, { "completion_length": 329.75, "epoch": 0.18181818181818182, "grad_norm": 1.0090314149856567, "kl": 0.03431350365281105, "learning_rate": 2.6e-06, "loss": 0.0014, "reward": 1.8717395067214966, "reward_std": 2.0185225009918213, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.246739462018013, "rewards/reward_search_strategy": 0.25, "step": 52 }, { "completion_length": 292.5, "epoch": 0.1853146853146853, "grad_norm": 0.9738456606864929, "kl": 0.03869509696960449, "learning_rate": 2.6500000000000005e-06, "loss": 0.0015, "reward": 3.411203384399414, "reward_std": 2.776095390319824, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.28620344400405884, "rewards/reward_search_strategy": 0.0, "step": 53 }, { "completion_length": 262.25, "epoch": 0.1888111888111888, "grad_norm": 1.0264172554016113, "kl": 0.04221673682332039, "learning_rate": 2.7000000000000004e-06, "loss": 0.0017, "reward": 3.2230210304260254, "reward_std": 3.179140329360962, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.2730211913585663, "rewards/reward_search_strategy": 0.32500001788139343, "step": 54 }, { "completion_length": 278.0, "epoch": 0.19230769230769232, "grad_norm": 1.219490885734558, "kl": 0.07308313995599747, "learning_rate": 2.7500000000000004e-06, "loss": 0.0029, "reward": 2.9714736938476562, "reward_std": 3.078878164291382, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.32147371768951416, "rewards/reward_search_strategy": 0.2750000059604645, "step": 55 }, { "completion_length": 198.125, "epoch": 0.1958041958041958, "grad_norm": 2.261676788330078, "kl": 0.10734312981367111, "learning_rate": 2.8000000000000003e-06, "loss": 0.0043, "reward": 1.751394271850586, "reward_std": 2.916086196899414, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.35139432549476624, "rewards/reward_search_strategy": 0.15000000596046448, "step": 56 }, { "completion_length": 368.25, "epoch": 0.1993006993006993, "grad_norm": 1.0139119625091553, "kl": 0.07945723086595535, "learning_rate": 2.85e-06, "loss": 0.0032, "reward": 2.6666977405548096, "reward_std": 3.0744428634643555, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.1416977047920227, "rewards/reward_search_strategy": 0.15000000596046448, "step": 57 }, { "completion_length": 445.875, "epoch": 0.20279720279720279, "grad_norm": 1.2591164112091064, "kl": 0.0817643404006958, "learning_rate": 2.9e-06, "loss": 0.0033, "reward": 3.105250358581543, "reward_std": 2.503899574279785, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.4302505850791931, "rewards/reward_search_strategy": 0.17499999701976776, "step": 58 }, { "completion_length": 465.875, "epoch": 0.2062937062937063, "grad_norm": 0.6122844219207764, "kl": 0.0703798234462738, "learning_rate": 2.95e-06, "loss": 0.0028, "reward": 1.6243177652359009, "reward_std": 1.8378466367721558, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.37431782484054565, "rewards/reward_search_strategy": 0.125, "step": 59 }, { "completion_length": 256.375, "epoch": 0.2097902097902098, "grad_norm": 0.9197457432746887, "kl": 0.08273329585790634, "learning_rate": 3e-06, "loss": 0.0033, "reward": 2.420247793197632, "reward_std": 2.3122122287750244, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.4202479422092438, "rewards/reward_search_strategy": 0.25, "step": 60 }, { "completion_length": 517.125, "epoch": 0.21328671328671328, "grad_norm": 0.8799907565116882, "kl": 0.20334148406982422, "learning_rate": 3.05e-06, "loss": 0.0081, "reward": 1.377384066581726, "reward_std": 1.68935227394104, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.3023841083049774, "rewards/reward_search_strategy": 0.07500000298023224, "step": 61 }, { "completion_length": 502.5, "epoch": 0.21678321678321677, "grad_norm": 0.7109338641166687, "kl": 0.03114498406648636, "learning_rate": 3.1000000000000004e-06, "loss": 0.0012, "reward": 5.466271877288818, "reward_std": 2.7285211086273193, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.6912716627120972, "rewards/reward_search_strategy": 0.4000000059604645, "step": 62 }, { "completion_length": 503.25, "epoch": 0.2202797202797203, "grad_norm": 0.9971331357955933, "kl": 0.20209850370883942, "learning_rate": 3.1500000000000003e-06, "loss": 0.0081, "reward": 1.2392462491989136, "reward_std": 1.4006211757659912, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.189246267080307, "rewards/reward_search_strategy": 0.17500001192092896, "step": 63 }, { "completion_length": 288.5, "epoch": 0.22377622377622378, "grad_norm": 0.8857221603393555, "kl": 0.10349184274673462, "learning_rate": 3.2000000000000003e-06, "loss": 0.0041, "reward": 1.3573650121688843, "reward_std": 1.9148555994033813, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.40736502408981323, "rewards/reward_search_strategy": 0.07500000298023224, "step": 64 }, { "completion_length": 445.875, "epoch": 0.22727272727272727, "grad_norm": 2.050611734390259, "kl": 0.9011820554733276, "learning_rate": 3.2500000000000002e-06, "loss": 0.036, "reward": 2.7885334491729736, "reward_std": 2.6393096446990967, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.46353352069854736, "rewards/reward_search_strategy": 0.20000000298023224, "step": 65 }, { "completion_length": 321.75, "epoch": 0.23076923076923078, "grad_norm": 0.8712911605834961, "kl": 0.11524824798107147, "learning_rate": 3.3000000000000006e-06, "loss": 0.0046, "reward": 2.963777542114258, "reward_std": 2.7310569286346436, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.21377724409103394, "rewards/reward_search_strategy": 0.125, "step": 66 }, { "completion_length": 579.5, "epoch": 0.23426573426573427, "grad_norm": 0.6312026977539062, "kl": 0.08477935940027237, "learning_rate": 3.3500000000000005e-06, "loss": 0.0034, "reward": 4.128048896789551, "reward_std": 2.959887742996216, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.30304890871047974, "rewards/reward_search_strategy": 0.20000000298023224, "step": 67 }, { "completion_length": 256.625, "epoch": 0.23776223776223776, "grad_norm": 0.835229218006134, "kl": 0.1031595766544342, "learning_rate": 3.4000000000000005e-06, "loss": 0.0041, "reward": 5.231247425079346, "reward_std": 2.7173686027526855, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.48124733567237854, "rewards/reward_search_strategy": 0.25, "step": 68 }, { "completion_length": 458.125, "epoch": 0.24125874125874125, "grad_norm": 1.0669630765914917, "kl": 0.050319548696279526, "learning_rate": 3.45e-06, "loss": 0.002, "reward": 2.627213716506958, "reward_std": 2.752366781234741, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.1022137925028801, "rewards/reward_search_strategy": 0.4000000059604645, "step": 69 }, { "completion_length": 758.125, "epoch": 0.24475524475524477, "grad_norm": 0.4896698594093323, "kl": 0.03787853941321373, "learning_rate": 3.5e-06, "loss": 0.0015, "reward": 3.715177059173584, "reward_std": 2.108518362045288, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.44017714262008667, "rewards/reward_search_strategy": 0.3999999761581421, "step": 70 }, { "completion_length": 359.625, "epoch": 0.24825174825174826, "grad_norm": 1.1720679998397827, "kl": 0.09393578767776489, "learning_rate": 3.5500000000000003e-06, "loss": 0.0038, "reward": 5.1651530265808105, "reward_std": 2.7064919471740723, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.39015328884124756, "rewards/reward_search_strategy": 0.5249999761581421, "step": 71 }, { "completion_length": 603.75, "epoch": 0.2517482517482518, "grad_norm": 0.92381352186203, "kl": 0.03434322774410248, "learning_rate": 3.6000000000000003e-06, "loss": 0.0014, "reward": 1.6090435981750488, "reward_std": 1.3871439695358276, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.5090435743331909, "rewards/reward_search_strategy": 0.22500000894069672, "step": 72 }, { "completion_length": 324.125, "epoch": 0.25524475524475526, "grad_norm": 0.8183465003967285, "kl": 0.09460737556219101, "learning_rate": 3.65e-06, "loss": 0.0038, "reward": 1.5781946182250977, "reward_std": 2.772611618041992, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.25319457054138184, "rewards/reward_search_strategy": 0.07500000298023224, "step": 73 }, { "completion_length": 479.5, "epoch": 0.25874125874125875, "grad_norm": 0.8604242205619812, "kl": 0.053619399666786194, "learning_rate": 3.7e-06, "loss": 0.0021, "reward": 4.117947578430176, "reward_std": 2.7907135486602783, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.41794782876968384, "rewards/reward_search_strategy": 0.20000000298023224, "step": 74 }, { "completion_length": 450.25, "epoch": 0.26223776223776224, "grad_norm": 2.1643834114074707, "kl": 0.19324736297130585, "learning_rate": 3.7500000000000005e-06, "loss": 0.0077, "reward": 4.815071105957031, "reward_std": 3.0326616764068604, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.39007121324539185, "rewards/reward_search_strategy": 0.42500001192092896, "step": 75 }, { "completion_length": 412.25, "epoch": 0.26573426573426573, "grad_norm": 0.8062530159950256, "kl": 0.0793648362159729, "learning_rate": 3.8000000000000005e-06, "loss": 0.0032, "reward": 3.5049312114715576, "reward_std": 2.922109603881836, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.5549312829971313, "rewards/reward_search_strategy": 0.20000000298023224, "step": 76 }, { "completion_length": 369.75, "epoch": 0.2692307692307692, "grad_norm": 1.467880368232727, "kl": 0.0835878923535347, "learning_rate": 3.85e-06, "loss": 0.0033, "reward": 2.802757978439331, "reward_std": 2.5307087898254395, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.3277580440044403, "rewards/reward_search_strategy": 0.22500000894069672, "step": 77 }, { "completion_length": 382.375, "epoch": 0.2727272727272727, "grad_norm": 4.771082401275635, "kl": 0.9713490605354309, "learning_rate": 3.900000000000001e-06, "loss": 0.0389, "reward": 2.052755117416382, "reward_std": 3.0059754848480225, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.3777550160884857, "rewards/reward_search_strategy": 0.30000001192092896, "step": 78 }, { "completion_length": 546.125, "epoch": 0.2762237762237762, "grad_norm": 0.48559486865997314, "kl": 0.037933606654405594, "learning_rate": 3.95e-06, "loss": 0.0015, "reward": 4.67194938659668, "reward_std": 1.9323244094848633, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5219494700431824, "rewards/reward_search_strategy": 0.3999999761581421, "step": 79 }, { "completion_length": 677.25, "epoch": 0.27972027972027974, "grad_norm": 0.42382290959358215, "kl": 0.0389033704996109, "learning_rate": 4.000000000000001e-06, "loss": 0.0016, "reward": 1.6928536891937256, "reward_std": 2.6118526458740234, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.367853581905365, "rewards/reward_search_strategy": 0.20000000298023224, "step": 80 }, { "completion_length": 472.125, "epoch": 0.28321678321678323, "grad_norm": 0.8816371560096741, "kl": 0.07524556666612625, "learning_rate": 4.05e-06, "loss": 0.003, "reward": 3.7551965713500977, "reward_std": 2.652488946914673, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.4801965653896332, "rewards/reward_search_strategy": 0.15000000596046448, "step": 81 }, { "completion_length": 1057.125, "epoch": 0.2867132867132867, "grad_norm": 0.5135111808776855, "kl": 0.03152136504650116, "learning_rate": 4.1e-06, "loss": 0.0013, "reward": 3.2323527336120605, "reward_std": 1.7208728790283203, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.682352602481842, "rewards/reward_search_strategy": 0.42500001192092896, "step": 82 }, { "completion_length": 485.375, "epoch": 0.2902097902097902, "grad_norm": 0.9151804447174072, "kl": 0.059312522411346436, "learning_rate": 4.15e-06, "loss": 0.0024, "reward": 1.5200467109680176, "reward_std": 1.384164810180664, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.4450467824935913, "rewards/reward_search_strategy": 0.07500000298023224, "step": 83 }, { "completion_length": 703.25, "epoch": 0.2937062937062937, "grad_norm": 0.708080530166626, "kl": 0.02395041473209858, "learning_rate": 4.2000000000000004e-06, "loss": 0.001, "reward": 4.584571838378906, "reward_std": 2.8503193855285645, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4345718026161194, "rewards/reward_search_strategy": 0.4000000059604645, "step": 84 }, { "completion_length": 507.75, "epoch": 0.2972027972027972, "grad_norm": 0.9842731356620789, "kl": 0.045889340341091156, "learning_rate": 4.25e-06, "loss": 0.0018, "reward": 6.216353893280029, "reward_std": 3.617387056350708, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4663536846637726, "rewards/reward_search_strategy": 0.75, "step": 85 }, { "completion_length": 362.75, "epoch": 0.3006993006993007, "grad_norm": 5.553736686706543, "kl": 0.31082066893577576, "learning_rate": 4.3e-06, "loss": 0.0124, "reward": 2.6352591514587402, "reward_std": 3.6418111324310303, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.18525923788547516, "rewards/reward_search_strategy": 0.32499998807907104, "step": 86 }, { "completion_length": 760.625, "epoch": 0.3041958041958042, "grad_norm": 0.40838149189949036, "kl": 0.03904329240322113, "learning_rate": 4.350000000000001e-06, "loss": 0.0016, "reward": 2.3138082027435303, "reward_std": 2.5393922328948975, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.43880823254585266, "rewards/reward_search_strategy": 0.375, "step": 87 }, { "completion_length": 233.875, "epoch": 0.3076923076923077, "grad_norm": 1.6991461515426636, "kl": 0.11344132572412491, "learning_rate": 4.4e-06, "loss": 0.0045, "reward": 5.137988090515137, "reward_std": 3.6587538719177246, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.28798776865005493, "rewards/reward_search_strategy": 0.4750000238418579, "step": 88 }, { "completion_length": 634.0, "epoch": 0.3111888111888112, "grad_norm": 1.008798599243164, "kl": 0.13237585127353668, "learning_rate": 4.450000000000001e-06, "loss": 0.0053, "reward": 2.58494234085083, "reward_std": 3.3475492000579834, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.30994224548339844, "rewards/reward_search_strategy": 0.3999999761581421, "step": 89 }, { "completion_length": 673.875, "epoch": 0.3146853146853147, "grad_norm": 0.38143813610076904, "kl": 0.04431832954287529, "learning_rate": 4.5e-06, "loss": 0.0018, "reward": 3.379831552505493, "reward_std": 2.6894783973693848, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.5048315525054932, "rewards/reward_search_strategy": 0.375, "step": 90 }, { "completion_length": 335.5, "epoch": 0.3181818181818182, "grad_norm": 1.326707124710083, "kl": 0.1567000150680542, "learning_rate": 4.5500000000000005e-06, "loss": 0.0063, "reward": 1.540712594985962, "reward_std": 1.9064555168151855, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.24071267247200012, "rewards/reward_search_strategy": 0.30000001192092896, "step": 91 }, { "completion_length": 479.25, "epoch": 0.32167832167832167, "grad_norm": 0.8233426213264465, "kl": 0.046305958181619644, "learning_rate": 4.600000000000001e-06, "loss": 0.0019, "reward": 4.979320526123047, "reward_std": 2.5221211910247803, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.6293203234672546, "rewards/reward_search_strategy": 0.3500000238418579, "step": 92 }, { "completion_length": 529.625, "epoch": 0.32517482517482516, "grad_norm": 0.8791444301605225, "kl": 0.049900904297828674, "learning_rate": 4.65e-06, "loss": 0.002, "reward": 4.73832893371582, "reward_std": 2.957406520843506, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5633291006088257, "rewards/reward_search_strategy": 0.550000011920929, "step": 93 }, { "completion_length": 566.25, "epoch": 0.32867132867132864, "grad_norm": 0.93157559633255, "kl": 0.10382921993732452, "learning_rate": 4.7e-06, "loss": 0.0042, "reward": 3.921233654022217, "reward_std": 3.259979486465454, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.39623361825942993, "rewards/reward_search_strategy": 0.5249999761581421, "step": 94 }, { "completion_length": 405.0, "epoch": 0.3321678321678322, "grad_norm": 0.6553046703338623, "kl": 0.05014060437679291, "learning_rate": 4.75e-06, "loss": 0.002, "reward": 4.048189163208008, "reward_std": 3.046299934387207, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3731893002986908, "rewards/reward_search_strategy": 0.550000011920929, "step": 95 }, { "completion_length": 559.0, "epoch": 0.3356643356643357, "grad_norm": 0.9019679427146912, "kl": 0.1154535710811615, "learning_rate": 4.800000000000001e-06, "loss": 0.0046, "reward": 3.3237497806549072, "reward_std": 2.755406379699707, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.19874979555606842, "rewards/reward_search_strategy": 0.375, "step": 96 }, { "completion_length": 513.125, "epoch": 0.33916083916083917, "grad_norm": 1.1792876720428467, "kl": 0.20403671264648438, "learning_rate": 4.85e-06, "loss": 0.0082, "reward": 4.584352970123291, "reward_std": 3.532867908477783, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4093528091907501, "rewards/reward_search_strategy": 0.550000011920929, "step": 97 }, { "completion_length": 390.75, "epoch": 0.34265734265734266, "grad_norm": 24.102558135986328, "kl": 1.4576557874679565, "learning_rate": 4.9000000000000005e-06, "loss": 0.0583, "reward": 3.2650551795959473, "reward_std": 1.5778415203094482, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4150553047657013, "rewards/reward_search_strategy": 0.4749999940395355, "step": 98 }, { "completion_length": 992.0, "epoch": 0.34615384615384615, "grad_norm": 1.07889723777771, "kl": 0.15134873986244202, "learning_rate": 4.95e-06, "loss": 0.0061, "reward": 3.4669156074523926, "reward_std": 3.278291940689087, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.31691551208496094, "rewards/reward_search_strategy": 0.4000000059604645, "step": 99 }, { "completion_length": 839.625, "epoch": 0.34965034965034963, "grad_norm": 0.5948315262794495, "kl": 0.03611273318529129, "learning_rate": 5e-06, "loss": 0.0014, "reward": 2.273786783218384, "reward_std": 1.3839948177337646, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5487868189811707, "rewards/reward_search_strategy": 0.4750000238418579, "step": 100 }, { "completion_length": 658.5, "epoch": 0.3531468531468531, "grad_norm": 0.6043696999549866, "kl": 0.07113679498434067, "learning_rate": 4.999984769144476e-06, "loss": 0.0028, "reward": 4.308359622955322, "reward_std": 2.9922401905059814, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.5333598256111145, "rewards/reward_search_strategy": 0.5250000357627869, "step": 101 }, { "completion_length": 658.375, "epoch": 0.35664335664335667, "grad_norm": 0.6331404447555542, "kl": 0.06461314111948013, "learning_rate": 4.999939076763487e-06, "loss": 0.0026, "reward": 4.334439277648926, "reward_std": 3.60390567779541, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3594394326210022, "rewards/reward_search_strategy": 0.6000000238418579, "step": 102 }, { "completion_length": 549.875, "epoch": 0.36013986013986016, "grad_norm": 0.9000433087348938, "kl": 0.18242445588111877, "learning_rate": 4.999862923413781e-06, "loss": 0.0073, "reward": 3.720637559890747, "reward_std": 3.3442742824554443, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.3706376552581787, "rewards/reward_search_strategy": 0.4750000238418579, "step": 103 }, { "completion_length": 560.875, "epoch": 0.36363636363636365, "grad_norm": 0.9529889822006226, "kl": 0.13583947718143463, "learning_rate": 4.999756310023261e-06, "loss": 0.0054, "reward": 5.090951442718506, "reward_std": 3.163343906402588, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.41595128178596497, "rewards/reward_search_strategy": 0.42500001192092896, "step": 104 }, { "completion_length": 784.5, "epoch": 0.36713286713286714, "grad_norm": 0.5621578693389893, "kl": 0.06874603033065796, "learning_rate": 4.9996192378909785e-06, "loss": 0.0027, "reward": 1.6759297847747803, "reward_std": 1.3889375925064087, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.4009298086166382, "rewards/reward_search_strategy": 0.2750000059604645, "step": 105 }, { "completion_length": 811.875, "epoch": 0.3706293706293706, "grad_norm": 0.3865280747413635, "kl": 0.03063635341823101, "learning_rate": 4.999451708687114e-06, "loss": 0.0012, "reward": 1.8364933729171753, "reward_std": 1.0776033401489258, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.186493381857872, "rewards/reward_search_strategy": 0.6499999761581421, "step": 106 }, { "completion_length": 767.0, "epoch": 0.3741258741258741, "grad_norm": 1.0235356092453003, "kl": 0.05863216146826744, "learning_rate": 4.9992537244529585e-06, "loss": 0.0023, "reward": 3.36350154876709, "reward_std": 2.900977611541748, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.43850138783454895, "rewards/reward_search_strategy": 0.6749999523162842, "step": 107 }, { "completion_length": 471.125, "epoch": 0.3776223776223776, "grad_norm": 1.1427947282791138, "kl": 0.05708540976047516, "learning_rate": 4.999025287600886e-06, "loss": 0.0023, "reward": 3.4347455501556396, "reward_std": 3.3557960987091064, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.28474536538124084, "rewards/reward_search_strategy": 0.4000000059604645, "step": 108 }, { "completion_length": 680.5, "epoch": 0.3811188811188811, "grad_norm": 0.5091893076896667, "kl": 0.06495320051908493, "learning_rate": 4.998766400914329e-06, "loss": 0.0026, "reward": 5.424283981323242, "reward_std": 3.0205914974212646, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.2242841124534607, "rewards/reward_search_strategy": 0.824999988079071, "step": 109 }, { "completion_length": 546.875, "epoch": 0.38461538461538464, "grad_norm": 0.7380320429801941, "kl": 0.080781489610672, "learning_rate": 4.99847706754774e-06, "loss": 0.0032, "reward": 3.1829848289489746, "reward_std": 1.9441657066345215, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.45798495411872864, "rewards/reward_search_strategy": 0.22500000894069672, "step": 110 }, { "completion_length": 696.125, "epoch": 0.3881118881118881, "grad_norm": 0.7519396543502808, "kl": 0.0903296023607254, "learning_rate": 4.998157291026553e-06, "loss": 0.0036, "reward": 3.140177011489868, "reward_std": 2.6969096660614014, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4151769280433655, "rewards/reward_search_strategy": 0.3500000238418579, "step": 111 }, { "completion_length": 530.5, "epoch": 0.3916083916083916, "grad_norm": 1.3212255239486694, "kl": 0.17164835333824158, "learning_rate": 4.997807075247147e-06, "loss": 0.0069, "reward": 2.3374075889587402, "reward_std": 1.7672338485717773, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.41240769624710083, "rewards/reward_search_strategy": 0.30000001192092896, "step": 112 }, { "completion_length": 703.375, "epoch": 0.3951048951048951, "grad_norm": 0.48862800002098083, "kl": 0.04361271113157272, "learning_rate": 4.997426424476787e-06, "loss": 0.0017, "reward": 4.811786651611328, "reward_std": 3.4794275760650635, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.48678621649742126, "rewards/reward_search_strategy": 0.699999988079071, "step": 113 }, { "completion_length": 666.375, "epoch": 0.3986013986013986, "grad_norm": 0.7718109488487244, "kl": 0.040162038058042526, "learning_rate": 4.9970153433535855e-06, "loss": 0.0016, "reward": 3.659369468688965, "reward_std": 2.0675926208496094, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.4593694806098938, "rewards/reward_search_strategy": 0.5750000476837158, "step": 114 }, { "completion_length": 640.625, "epoch": 0.4020979020979021, "grad_norm": 0.5138198137283325, "kl": 0.05123981833457947, "learning_rate": 4.9965738368864345e-06, "loss": 0.002, "reward": 2.629551410675049, "reward_std": 1.7166107892990112, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5045512914657593, "rewards/reward_search_strategy": 0.5, "step": 115 }, { "completion_length": 586.75, "epoch": 0.40559440559440557, "grad_norm": 1.4575413465499878, "kl": 0.08412657678127289, "learning_rate": 4.996101910454953e-06, "loss": 0.0034, "reward": 4.0647077560424805, "reward_std": 1.9030133485794067, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4147075116634369, "rewards/reward_search_strategy": 0.40000003576278687, "step": 116 }, { "completion_length": 514.625, "epoch": 0.4090909090909091, "grad_norm": 1.051775574684143, "kl": 0.09374314546585083, "learning_rate": 4.995599569809414e-06, "loss": 0.0037, "reward": 2.5817337036132812, "reward_std": 2.1470091342926025, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.30673372745513916, "rewards/reward_search_strategy": 0.2750000059604645, "step": 117 }, { "completion_length": 814.375, "epoch": 0.4125874125874126, "grad_norm": 0.4415126144886017, "kl": 0.07567695528268814, "learning_rate": 4.9950668210706795e-06, "loss": 0.003, "reward": 2.189971685409546, "reward_std": 1.4690890312194824, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.5649716854095459, "rewards/reward_search_strategy": 0.375, "step": 118 }, { "completion_length": 550.375, "epoch": 0.4160839160839161, "grad_norm": 0.50792396068573, "kl": 0.05492626503109932, "learning_rate": 4.994503670730126e-06, "loss": 0.0022, "reward": 4.787566661834717, "reward_std": 2.5005829334259033, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6875668168067932, "rewards/reward_search_strategy": 0.6000000238418579, "step": 119 }, { "completion_length": 609.0, "epoch": 0.4195804195804196, "grad_norm": 0.6223260760307312, "kl": 0.047494076192379, "learning_rate": 4.993910125649561e-06, "loss": 0.0019, "reward": 4.249635696411133, "reward_std": 2.9354586601257324, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3746356964111328, "rewards/reward_search_strategy": 0.3750000298023224, "step": 120 }, { "completion_length": 560.875, "epoch": 0.4230769230769231, "grad_norm": 0.9563676118850708, "kl": 0.09799660742282867, "learning_rate": 4.993286193061145e-06, "loss": 0.0039, "reward": 2.565558910369873, "reward_std": 2.178126811981201, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3405587375164032, "rewards/reward_search_strategy": 0.3500000238418579, "step": 121 }, { "completion_length": 387.625, "epoch": 0.42657342657342656, "grad_norm": 0.7505446076393127, "kl": 0.14984500408172607, "learning_rate": 4.992631880567301e-06, "loss": 0.006, "reward": 1.1751203536987305, "reward_std": 1.6967118978500366, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.20012035965919495, "rewards/reward_search_strategy": 0.22500000894069672, "step": 122 }, { "completion_length": 411.875, "epoch": 0.43006993006993005, "grad_norm": 1.1681523323059082, "kl": 0.0963008776307106, "learning_rate": 4.991947196140619e-06, "loss": 0.0039, "reward": 3.0484721660614014, "reward_std": 3.416553497314453, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.3984720706939697, "rewards/reward_search_strategy": 0.4000000059604645, "step": 123 }, { "completion_length": 423.75, "epoch": 0.43356643356643354, "grad_norm": 0.9286041259765625, "kl": 0.07872038334608078, "learning_rate": 4.9912321481237616e-06, "loss": 0.0031, "reward": 3.2697560787200928, "reward_std": 3.4370007514953613, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3947560787200928, "rewards/reward_search_strategy": 0.5, "step": 124 }, { "completion_length": 463.625, "epoch": 0.4370629370629371, "grad_norm": 1.0566741228103638, "kl": 0.0985063686966896, "learning_rate": 4.990486745229364e-06, "loss": 0.0039, "reward": 1.5522143840789795, "reward_std": 1.579607605934143, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.37721437215805054, "rewards/reward_search_strategy": 0.17500001192092896, "step": 125 }, { "completion_length": 685.625, "epoch": 0.4405594405594406, "grad_norm": 0.6656001806259155, "kl": 0.06633864343166351, "learning_rate": 4.989710996539926e-06, "loss": 0.0027, "reward": 2.43574857711792, "reward_std": 2.3410024642944336, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.585748553276062, "rewards/reward_search_strategy": 0.3500000238418579, "step": 126 }, { "completion_length": 637.375, "epoch": 0.44405594405594406, "grad_norm": 0.7161982655525208, "kl": 0.07665619254112244, "learning_rate": 4.9889049115077e-06, "loss": 0.0031, "reward": 3.5046868324279785, "reward_std": 2.549543619155884, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5796867609024048, "rewards/reward_search_strategy": 0.550000011920929, "step": 127 }, { "completion_length": 644.375, "epoch": 0.44755244755244755, "grad_norm": 0.5414532423019409, "kl": 0.06415767967700958, "learning_rate": 4.988068499954578e-06, "loss": 0.0026, "reward": 5.926258563995361, "reward_std": 2.7303335666656494, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6762584447860718, "rewards/reward_search_strategy": 0.625, "step": 128 }, { "completion_length": 397.25, "epoch": 0.45104895104895104, "grad_norm": 0.8846190571784973, "kl": 0.14254269003868103, "learning_rate": 4.987201772071971e-06, "loss": 0.0057, "reward": 2.7677345275878906, "reward_std": 2.424544095993042, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5427343845367432, "rewards/reward_search_strategy": 0.4750000238418579, "step": 129 }, { "completion_length": 463.875, "epoch": 0.45454545454545453, "grad_norm": 0.6526610851287842, "kl": 0.12603093683719635, "learning_rate": 4.986304738420684e-06, "loss": 0.005, "reward": 5.665769577026367, "reward_std": 2.0002214908599854, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.1657698005437851, "rewards/reward_search_strategy": 0.5, "step": 130 }, { "completion_length": 425.125, "epoch": 0.458041958041958, "grad_norm": 1.6641560792922974, "kl": 0.2266431599855423, "learning_rate": 4.985377409930789e-06, "loss": 0.0091, "reward": 4.595471382141113, "reward_std": 3.3347935676574707, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.42047110199928284, "rewards/reward_search_strategy": 0.550000011920929, "step": 131 }, { "completion_length": 708.75, "epoch": 0.46153846153846156, "grad_norm": 0.49309638142585754, "kl": 0.07724378257989883, "learning_rate": 4.984419797901491e-06, "loss": 0.0031, "reward": 2.9709227085113525, "reward_std": 0.63145512342453, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.4459228515625, "rewards/reward_search_strategy": 0.5249999761581421, "step": 132 }, { "completion_length": 570.75, "epoch": 0.46503496503496505, "grad_norm": 0.8325252532958984, "kl": 0.15752217173576355, "learning_rate": 4.983431914000991e-06, "loss": 0.0063, "reward": 2.9882287979125977, "reward_std": 2.527655601501465, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.463228702545166, "rewards/reward_search_strategy": 0.5250000357627869, "step": 133 }, { "completion_length": 669.625, "epoch": 0.46853146853146854, "grad_norm": 1.057653784751892, "kl": 0.10220281034708023, "learning_rate": 4.9824137702663424e-06, "loss": 0.0041, "reward": 2.843283176422119, "reward_std": 2.226605176925659, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3932831287384033, "rewards/reward_search_strategy": 0.45000001788139343, "step": 134 }, { "completion_length": 387.25, "epoch": 0.47202797202797203, "grad_norm": 0.7793525457382202, "kl": 0.09381990879774094, "learning_rate": 4.981365379103306e-06, "loss": 0.0038, "reward": 3.3774726390838623, "reward_std": 2.2550406455993652, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.37747257947921753, "rewards/reward_search_strategy": 0.5, "step": 135 }, { "completion_length": 454.625, "epoch": 0.4755244755244755, "grad_norm": 0.6707426309585571, "kl": 0.09792742133140564, "learning_rate": 4.980286753286196e-06, "loss": 0.0039, "reward": 5.85577392578125, "reward_std": 2.534198522567749, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5807737112045288, "rewards/reward_search_strategy": 0.6499999761581421, "step": 136 }, { "completion_length": 513.0, "epoch": 0.479020979020979, "grad_norm": 0.91905677318573, "kl": 0.12576593458652496, "learning_rate": 4.979177905957726e-06, "loss": 0.005, "reward": 2.693246603012085, "reward_std": 1.9298425912857056, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.44324639439582825, "rewards/reward_search_strategy": 0.25, "step": 137 }, { "completion_length": 535.0, "epoch": 0.4825174825174825, "grad_norm": 0.6421681642532349, "kl": 0.16189059615135193, "learning_rate": 4.978038850628855e-06, "loss": 0.0065, "reward": 2.3643674850463867, "reward_std": 2.4537479877471924, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.614367663860321, "rewards/reward_search_strategy": 0.5, "step": 138 }, { "completion_length": 388.625, "epoch": 0.486013986013986, "grad_norm": 1.6358879804611206, "kl": 0.31070151925086975, "learning_rate": 4.9768696011786095e-06, "loss": 0.0124, "reward": 7.369345664978027, "reward_std": 2.027414560317993, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.11934606730937958, "rewards/reward_search_strategy": 0.875, "step": 139 }, { "completion_length": 430.375, "epoch": 0.48951048951048953, "grad_norm": 1.3875221014022827, "kl": 0.1299194097518921, "learning_rate": 4.975670171853926e-06, "loss": 0.0052, "reward": 4.600214958190918, "reward_std": 3.4139461517333984, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4752153754234314, "rewards/reward_search_strategy": 0.625, "step": 140 }, { "completion_length": 619.375, "epoch": 0.493006993006993, "grad_norm": 1.2012630701065063, "kl": 0.07434721291065216, "learning_rate": 4.974440577269473e-06, "loss": 0.003, "reward": 3.8403759002685547, "reward_std": 2.251944065093994, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.34037598967552185, "rewards/reward_search_strategy": 0.625, "step": 141 }, { "completion_length": 662.0, "epoch": 0.4965034965034965, "grad_norm": 0.4479227066040039, "kl": 0.08045605570077896, "learning_rate": 4.973180832407471e-06, "loss": 0.0032, "reward": 4.356062889099121, "reward_std": 1.873792052268982, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.6060628294944763, "rewards/reward_search_strategy": 0.5, "step": 142 }, { "completion_length": 408.0, "epoch": 0.5, "grad_norm": 0.774638831615448, "kl": 0.16293062269687653, "learning_rate": 4.971890952617515e-06, "loss": 0.0065, "reward": 3.544522762298584, "reward_std": 3.194159507751465, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.31952300667762756, "rewards/reward_search_strategy": 0.3500000238418579, "step": 143 }, { "completion_length": 466.75, "epoch": 0.5034965034965035, "grad_norm": 0.8625693917274475, "kl": 0.10907953977584839, "learning_rate": 4.970570953616383e-06, "loss": 0.0044, "reward": 3.3587875366210938, "reward_std": 2.7467174530029297, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.3837875723838806, "rewards/reward_search_strategy": 0.3500000238418579, "step": 144 }, { "completion_length": 856.125, "epoch": 0.506993006993007, "grad_norm": 0.37043091654777527, "kl": 0.05690326541662216, "learning_rate": 4.9692208514878445e-06, "loss": 0.0023, "reward": 5.424896240234375, "reward_std": 2.465031862258911, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.49989593029022217, "rewards/reward_search_strategy": 0.7999999523162842, "step": 145 }, { "completion_length": 557.25, "epoch": 0.5104895104895105, "grad_norm": 0.8445433378219604, "kl": 0.06749890744686127, "learning_rate": 4.96784066268247e-06, "loss": 0.0027, "reward": 2.7921762466430664, "reward_std": 1.2953386306762695, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5421763062477112, "rewards/reward_search_strategy": 0.625, "step": 146 }, { "completion_length": 447.5, "epoch": 0.513986013986014, "grad_norm": 0.9278783798217773, "kl": 0.10697238147258759, "learning_rate": 4.966430404017424e-06, "loss": 0.0043, "reward": 3.9682509899139404, "reward_std": 3.156663417816162, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.41825127601623535, "rewards/reward_search_strategy": 0.42500001192092896, "step": 147 }, { "completion_length": 379.625, "epoch": 0.5174825174825175, "grad_norm": 1.5938533544540405, "kl": 0.11513354629278183, "learning_rate": 4.964990092676263e-06, "loss": 0.0046, "reward": 3.4113523960113525, "reward_std": 2.4409544467926025, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.2863525450229645, "rewards/reward_search_strategy": 0.5, "step": 148 }, { "completion_length": 411.875, "epoch": 0.5209790209790209, "grad_norm": 1.070616364479065, "kl": 0.2734951078891754, "learning_rate": 4.963519746208726e-06, "loss": 0.0109, "reward": 2.4380977153778076, "reward_std": 2.1429409980773926, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.363097608089447, "rewards/reward_search_strategy": 0.20000000298023224, "step": 149 }, { "completion_length": 355.75, "epoch": 0.5244755244755245, "grad_norm": 0.9825189113616943, "kl": 0.14074762165546417, "learning_rate": 4.962019382530521e-06, "loss": 0.0056, "reward": 6.416107177734375, "reward_std": 3.2488479614257812, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6411075592041016, "rewards/reward_search_strategy": 0.7749999761581421, "step": 150 }, { "completion_length": 642.5, "epoch": 0.527972027972028, "grad_norm": 0.9618121385574341, "kl": 0.07677344232797623, "learning_rate": 4.960489019923105e-06, "loss": 0.0031, "reward": 2.5563745498657227, "reward_std": 1.2503511905670166, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4313744604587555, "rewards/reward_search_strategy": 0.625, "step": 151 }, { "completion_length": 513.375, "epoch": 0.5314685314685315, "grad_norm": 1.855919599533081, "kl": 0.21450263261795044, "learning_rate": 4.958928677033465e-06, "loss": 0.0086, "reward": 3.358847141265869, "reward_std": 1.8732644319534302, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.28384697437286377, "rewards/reward_search_strategy": 0.5750000476837158, "step": 152 }, { "completion_length": 478.375, "epoch": 0.534965034965035, "grad_norm": 0.5649963021278381, "kl": 0.059622377157211304, "learning_rate": 4.957338372873886e-06, "loss": 0.0024, "reward": 3.669475793838501, "reward_std": 3.19222354888916, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4944758117198944, "rewards/reward_search_strategy": 0.550000011920929, "step": 153 }, { "completion_length": 233.25, "epoch": 0.5384615384615384, "grad_norm": 1.1332588195800781, "kl": 0.09504832327365875, "learning_rate": 4.9557181268217225e-06, "loss": 0.0038, "reward": 3.0735678672790527, "reward_std": 2.9650135040283203, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.22356799244880676, "rewards/reward_search_strategy": 0.10000000149011612, "step": 154 }, { "completion_length": 365.75, "epoch": 0.541958041958042, "grad_norm": 0.7829000949859619, "kl": 0.12874586880207062, "learning_rate": 4.9540679586191605e-06, "loss": 0.0051, "reward": 3.2776992321014404, "reward_std": 2.2649874687194824, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4526992440223694, "rewards/reward_search_strategy": 0.44999998807907104, "step": 155 }, { "completion_length": 287.875, "epoch": 0.5454545454545454, "grad_norm": 1.0810236930847168, "kl": 0.17965207993984222, "learning_rate": 4.9523878883729794e-06, "loss": 0.0072, "reward": 5.42661714553833, "reward_std": 3.3766465187072754, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.40161705017089844, "rewards/reward_search_strategy": 0.6500000357627869, "step": 156 }, { "completion_length": 562.375, "epoch": 0.548951048951049, "grad_norm": 0.629564642906189, "kl": 0.047585126012563705, "learning_rate": 4.9506779365543054e-06, "loss": 0.0019, "reward": 3.9784889221191406, "reward_std": 0.8592731356620789, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.628489077091217, "rewards/reward_search_strategy": 0.4749999940395355, "step": 157 }, { "completion_length": 333.125, "epoch": 0.5524475524475524, "grad_norm": 0.8598470091819763, "kl": 0.23344238102436066, "learning_rate": 4.94893812399836e-06, "loss": 0.0093, "reward": 5.325415134429932, "reward_std": 2.6435444355010986, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3754151463508606, "rewards/reward_search_strategy": 0.44999998807907104, "step": 158 }, { "completion_length": 369.0, "epoch": 0.5559440559440559, "grad_norm": 1.518742322921753, "kl": 0.1705542653799057, "learning_rate": 4.947168471904213e-06, "loss": 0.0068, "reward": 5.637444972991943, "reward_std": 3.7453742027282715, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.41244518756866455, "rewards/reward_search_strategy": 0.6000000238418579, "step": 159 }, { "completion_length": 409.625, "epoch": 0.5594405594405595, "grad_norm": 1.2653708457946777, "kl": 0.2512824833393097, "learning_rate": 4.9453690018345144e-06, "loss": 0.0101, "reward": 1.8753092288970947, "reward_std": 1.6477832794189453, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.32530921697616577, "rewards/reward_search_strategy": 0.30000001192092896, "step": 160 }, { "completion_length": 766.375, "epoch": 0.5629370629370629, "grad_norm": 0.4625481367111206, "kl": 0.056169163435697556, "learning_rate": 4.9435397357152406e-06, "loss": 0.0022, "reward": 3.3957467079162598, "reward_std": 1.1029468774795532, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5957465767860413, "rewards/reward_search_strategy": 0.550000011920929, "step": 161 }, { "completion_length": 749.875, "epoch": 0.5664335664335665, "grad_norm": 0.6248977184295654, "kl": 0.07820535451173782, "learning_rate": 4.9416806958354206e-06, "loss": 0.0031, "reward": 4.84914493560791, "reward_std": 3.21990704536438, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.5491449236869812, "rewards/reward_search_strategy": 0.7999999523162842, "step": 162 }, { "completion_length": 318.375, "epoch": 0.5699300699300699, "grad_norm": 2.4732108116149902, "kl": 0.2569376230239868, "learning_rate": 4.939791904846869e-06, "loss": 0.0103, "reward": 5.542656898498535, "reward_std": 3.050025224685669, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.39265698194503784, "rewards/reward_search_strategy": 0.4000000059604645, "step": 163 }, { "completion_length": 638.5, "epoch": 0.5734265734265734, "grad_norm": 0.5316995978355408, "kl": 0.06073322519659996, "learning_rate": 4.937873385763909e-06, "loss": 0.0024, "reward": 3.4646921157836914, "reward_std": 0.9867516160011292, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.614692211151123, "rewards/reward_search_strategy": 0.4750000238418579, "step": 164 }, { "completion_length": 543.0, "epoch": 0.5769230769230769, "grad_norm": 8.116938591003418, "kl": 4.153862953186035, "learning_rate": 4.935925161963089e-06, "loss": 0.1662, "reward": 4.612438678741455, "reward_std": 3.483085870742798, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.5374384522438049, "rewards/reward_search_strategy": 0.699999988079071, "step": 165 }, { "completion_length": 378.5, "epoch": 0.5804195804195804, "grad_norm": 0.9611401557922363, "kl": 0.18193311989307404, "learning_rate": 4.933947257182901e-06, "loss": 0.0073, "reward": 4.473653316497803, "reward_std": 3.3241894245147705, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.19865311682224274, "rewards/reward_search_strategy": 0.7749999761581421, "step": 166 }, { "completion_length": 465.625, "epoch": 0.583916083916084, "grad_norm": 0.7452073097229004, "kl": 0.09764686226844788, "learning_rate": 4.9319396955234925e-06, "loss": 0.0039, "reward": 4.452567100524902, "reward_std": 3.8106887340545654, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4025672674179077, "rewards/reward_search_strategy": 0.550000011920929, "step": 167 }, { "completion_length": 613.75, "epoch": 0.5874125874125874, "grad_norm": 0.5454308390617371, "kl": 0.057498060166835785, "learning_rate": 4.9299025014463665e-06, "loss": 0.0023, "reward": 2.4831161499023438, "reward_std": 0.9839221239089966, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.43311628699302673, "rewards/reward_search_strategy": 0.550000011920929, "step": 168 }, { "completion_length": 654.25, "epoch": 0.5909090909090909, "grad_norm": 0.566487193107605, "kl": 0.04927007481455803, "learning_rate": 4.92783569977409e-06, "loss": 0.002, "reward": 3.5991201400756836, "reward_std": 2.8235316276550293, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.22412018477916718, "rewards/reward_search_strategy": 0.625, "step": 169 }, { "completion_length": 295.625, "epoch": 0.5944055944055944, "grad_norm": 1.1766833066940308, "kl": 0.24150727689266205, "learning_rate": 4.925739315689991e-06, "loss": 0.0097, "reward": 2.3266749382019043, "reward_std": 3.2796859741210938, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.27667519450187683, "rewards/reward_search_strategy": 0.30000001192092896, "step": 170 }, { "completion_length": 527.125, "epoch": 0.5979020979020979, "grad_norm": 0.6295068264007568, "kl": 0.08597421646118164, "learning_rate": 4.923613374737848e-06, "loss": 0.0034, "reward": 3.203831672668457, "reward_std": 2.0840423107147217, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6288318037986755, "rewards/reward_search_strategy": 0.574999988079071, "step": 171 }, { "completion_length": 423.5, "epoch": 0.6013986013986014, "grad_norm": 0.7889199256896973, "kl": 0.09207924455404282, "learning_rate": 4.921457902821578e-06, "loss": 0.0037, "reward": 3.5908405780792236, "reward_std": 2.698184013366699, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3158404231071472, "rewards/reward_search_strategy": 0.40000003576278687, "step": 172 }, { "completion_length": 515.875, "epoch": 0.6048951048951049, "grad_norm": 1.1472229957580566, "kl": 0.08138255774974823, "learning_rate": 4.9192729262049285e-06, "loss": 0.0033, "reward": 2.5571367740631104, "reward_std": 2.301767349243164, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.30713677406311035, "rewards/reward_search_strategy": 0.25, "step": 173 }, { "completion_length": 380.5, "epoch": 0.6083916083916084, "grad_norm": 1.0713775157928467, "kl": 0.08360132575035095, "learning_rate": 4.917058471511149e-06, "loss": 0.0033, "reward": 2.460089683532715, "reward_std": 2.595458507537842, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.4350895881652832, "rewards/reward_search_strategy": 0.2750000059604645, "step": 174 }, { "completion_length": 529.25, "epoch": 0.6118881118881119, "grad_norm": 0.463553249835968, "kl": 0.05024154111742973, "learning_rate": 4.914814565722671e-06, "loss": 0.002, "reward": 3.3107078075408936, "reward_std": 2.1608943939208984, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.48570770025253296, "rewards/reward_search_strategy": 0.5750000476837158, "step": 175 }, { "completion_length": 482.75, "epoch": 0.6153846153846154, "grad_norm": 1.2290676832199097, "kl": 0.14474555850028992, "learning_rate": 4.912541236180779e-06, "loss": 0.0058, "reward": 3.2196455001831055, "reward_std": 2.8522043228149414, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.2946455180644989, "rewards/reward_search_strategy": 0.42500001192092896, "step": 176 }, { "completion_length": 423.75, "epoch": 0.6188811188811189, "grad_norm": 1.102494716644287, "kl": 0.059243083000183105, "learning_rate": 4.910238510585275e-06, "loss": 0.0024, "reward": 2.8200690746307373, "reward_std": 2.557361364364624, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.39506906270980835, "rewards/reward_search_strategy": 0.42500001192092896, "step": 177 }, { "completion_length": 502.125, "epoch": 0.6223776223776224, "grad_norm": 1.207655668258667, "kl": 0.11078634113073349, "learning_rate": 4.907906416994146e-06, "loss": 0.0044, "reward": 4.589856147766113, "reward_std": 2.6245768070220947, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.48985564708709717, "rewards/reward_search_strategy": 0.6000000238418579, "step": 178 }, { "completion_length": 508.875, "epoch": 0.6258741258741258, "grad_norm": 1.0890285968780518, "kl": 0.08423227816820145, "learning_rate": 4.905544983823214e-06, "loss": 0.0034, "reward": 2.421203374862671, "reward_std": 1.544132113456726, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.44620347023010254, "rewards/reward_search_strategy": 0.3500000238418579, "step": 179 }, { "completion_length": 332.875, "epoch": 0.6293706293706294, "grad_norm": 1.8258018493652344, "kl": 0.15480613708496094, "learning_rate": 4.903154239845798e-06, "loss": 0.0062, "reward": 1.375791311264038, "reward_std": 1.4795390367507935, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.3257913589477539, "rewards/reward_search_strategy": 0.30000001192092896, "step": 180 }, { "completion_length": 312.25, "epoch": 0.6328671328671329, "grad_norm": 6.885568618774414, "kl": 0.5282591581344604, "learning_rate": 4.900734214192358e-06, "loss": 0.0211, "reward": 4.6441650390625, "reward_std": 3.3753483295440674, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5191651582717896, "rewards/reward_search_strategy": 0.5, "step": 181 }, { "completion_length": 442.375, "epoch": 0.6363636363636364, "grad_norm": 0.612838864326477, "kl": 0.08585107326507568, "learning_rate": 4.898284936350144e-06, "loss": 0.0034, "reward": 4.933065891265869, "reward_std": 3.642153739929199, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.7080655694007874, "rewards/reward_search_strategy": 0.4749999940395355, "step": 182 }, { "completion_length": 244.25, "epoch": 0.6398601398601399, "grad_norm": 1.8637006282806396, "kl": 0.38500577211380005, "learning_rate": 4.8958064361628334e-06, "loss": 0.0154, "reward": 6.256934642791748, "reward_std": 1.2352032661437988, "rewards/reward_correctness": 1.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4569346308708191, "rewards/reward_search_strategy": 0.30000001192092896, "step": 183 }, { "completion_length": 373.5, "epoch": 0.6433566433566433, "grad_norm": 1.1437357664108276, "kl": 0.12481357157230377, "learning_rate": 4.893298743830168e-06, "loss": 0.005, "reward": 3.356623411178589, "reward_std": 2.37040638923645, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5566233992576599, "rewards/reward_search_strategy": 0.30000001192092896, "step": 184 }, { "completion_length": 549.125, "epoch": 0.6468531468531469, "grad_norm": 0.6662846207618713, "kl": 0.10428506135940552, "learning_rate": 4.890761889907589e-06, "loss": 0.0042, "reward": 6.009631633758545, "reward_std": 3.1016297340393066, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.7346314191818237, "rewards/reward_search_strategy": 0.6499999761581421, "step": 185 }, { "completion_length": 549.0, "epoch": 0.6503496503496503, "grad_norm": 0.7750718593597412, "kl": 0.09990722686052322, "learning_rate": 4.888195905305859e-06, "loss": 0.004, "reward": 4.673529624938965, "reward_std": 2.434964656829834, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5485298037528992, "rewards/reward_search_strategy": 0.625, "step": 186 }, { "completion_length": 531.625, "epoch": 0.6538461538461539, "grad_norm": 0.8020265102386475, "kl": 0.09699496626853943, "learning_rate": 4.885600821290692e-06, "loss": 0.0039, "reward": 3.3051586151123047, "reward_std": 1.4629502296447754, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4051586389541626, "rewards/reward_search_strategy": 0.4000000059604645, "step": 187 }, { "completion_length": 522.875, "epoch": 0.6573426573426573, "grad_norm": 1.3859021663665771, "kl": 0.13135036826133728, "learning_rate": 4.882976669482368e-06, "loss": 0.0053, "reward": 2.6884827613830566, "reward_std": 2.2438197135925293, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.388482928276062, "rewards/reward_search_strategy": 0.30000001192092896, "step": 188 }, { "completion_length": 518.5, "epoch": 0.6608391608391608, "grad_norm": 9.410249710083008, "kl": 0.6956667900085449, "learning_rate": 4.880323481855347e-06, "loss": 0.0278, "reward": 4.050230979919434, "reward_std": 3.249497652053833, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.37523072957992554, "rewards/reward_search_strategy": 0.550000011920929, "step": 189 }, { "completion_length": 386.5, "epoch": 0.6643356643356644, "grad_norm": 0.8387676477432251, "kl": 0.08535090833902359, "learning_rate": 4.8776412907378845e-06, "loss": 0.0034, "reward": 4.588320255279541, "reward_std": 2.220287799835205, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.28832051157951355, "rewards/reward_search_strategy": 0.550000011920929, "step": 190 }, { "completion_length": 408.5, "epoch": 0.6678321678321678, "grad_norm": 2.006375551223755, "kl": 0.11828400194644928, "learning_rate": 4.874930128811631e-06, "loss": 0.0047, "reward": 3.0512070655822754, "reward_std": 2.421907424926758, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.30120688676834106, "rewards/reward_search_strategy": 0.5, "step": 191 }, { "completion_length": 245.75, "epoch": 0.6713286713286714, "grad_norm": 1.0472596883773804, "kl": 0.12134737521409988, "learning_rate": 4.8721900291112415e-06, "loss": 0.0049, "reward": 4.235280990600586, "reward_std": 3.679039716720581, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3602810502052307, "rewards/reward_search_strategy": 0.5, "step": 192 }, { "completion_length": 318.625, "epoch": 0.6748251748251748, "grad_norm": 1.2686798572540283, "kl": 0.17116394639015198, "learning_rate": 4.869421025023965e-06, "loss": 0.0068, "reward": 4.4517951011657715, "reward_std": 3.264738082885742, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5267948508262634, "rewards/reward_search_strategy": 0.42500001192092896, "step": 193 }, { "completion_length": 284.0, "epoch": 0.6783216783216783, "grad_norm": 1.0447121858596802, "kl": 0.2941432595252991, "learning_rate": 4.866623150289241e-06, "loss": 0.0118, "reward": 1.9646129608154297, "reward_std": 2.391249418258667, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.33961305022239685, "rewards/reward_search_strategy": 0.25, "step": 194 }, { "completion_length": 497.0, "epoch": 0.6818181818181818, "grad_norm": 0.604982852935791, "kl": 0.06880504637956619, "learning_rate": 4.863796438998293e-06, "loss": 0.0028, "reward": 6.05696964263916, "reward_std": 2.8059637546539307, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5319693684577942, "rewards/reward_search_strategy": 0.7749999761581421, "step": 195 }, { "completion_length": 322.625, "epoch": 0.6853146853146853, "grad_norm": 1.2860020399093628, "kl": 0.195574089884758, "learning_rate": 4.860940925593703e-06, "loss": 0.0078, "reward": 4.560000419616699, "reward_std": 2.813296318054199, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.31000030040740967, "rewards/reward_search_strategy": 0.375, "step": 196 }, { "completion_length": 536.75, "epoch": 0.6888111888111889, "grad_norm": 0.6069127321243286, "kl": 0.055065739899873734, "learning_rate": 4.858056644869002e-06, "loss": 0.0022, "reward": 4.489222049713135, "reward_std": 2.621090888977051, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5142220854759216, "rewards/reward_search_strategy": 0.6000000238418579, "step": 197 }, { "completion_length": 288.125, "epoch": 0.6923076923076923, "grad_norm": 1.4299031496047974, "kl": 0.28020644187927246, "learning_rate": 4.855143631968242e-06, "loss": 0.0112, "reward": 1.7020204067230225, "reward_std": 1.4690635204315186, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.2770204544067383, "rewards/reward_search_strategy": 0.42500001192092896, "step": 198 }, { "completion_length": 451.625, "epoch": 0.6958041958041958, "grad_norm": 0.6012458205223083, "kl": 0.10064040124416351, "learning_rate": 4.852201922385564e-06, "loss": 0.004, "reward": 4.045350551605225, "reward_std": 2.6484973430633545, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5953505635261536, "rewards/reward_search_strategy": 0.5750000476837158, "step": 199 }, { "completion_length": 381.75, "epoch": 0.6993006993006993, "grad_norm": 1.712384819984436, "kl": 0.23387499153614044, "learning_rate": 4.849231551964771e-06, "loss": 0.0094, "reward": 6.612201690673828, "reward_std": 3.49477481842041, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4872019290924072, "rewards/reward_search_strategy": 0.75, "step": 200 }, { "completion_length": 489.0, "epoch": 0.7027972027972028, "grad_norm": 0.6931152939796448, "kl": 0.12770113348960876, "learning_rate": 4.84623255689889e-06, "loss": 0.0051, "reward": 4.282477378845215, "reward_std": 2.721886157989502, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5324774384498596, "rewards/reward_search_strategy": 0.5, "step": 201 }, { "completion_length": 308.75, "epoch": 0.7062937062937062, "grad_norm": 7.35214376449585, "kl": 2.9334309101104736, "learning_rate": 4.84320497372973e-06, "loss": 0.1173, "reward": 2.2045934200286865, "reward_std": 2.125786304473877, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4045933187007904, "rewards/reward_search_strategy": 0.42500001192092896, "step": 202 }, { "completion_length": 415.75, "epoch": 0.7097902097902098, "grad_norm": 0.819303035736084, "kl": 0.15163056552410126, "learning_rate": 4.840148839347434e-06, "loss": 0.0061, "reward": 5.433012962341309, "reward_std": 2.9335484504699707, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.7580130100250244, "rewards/reward_search_strategy": 0.675000011920929, "step": 203 }, { "completion_length": 259.625, "epoch": 0.7132867132867133, "grad_norm": 0.8853353261947632, "kl": 0.15020889043807983, "learning_rate": 4.837064190990036e-06, "loss": 0.006, "reward": 4.723485946655273, "reward_std": 3.2774343490600586, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.39848607778549194, "rewards/reward_search_strategy": 0.32499998807907104, "step": 204 }, { "completion_length": 263.75, "epoch": 0.7167832167832168, "grad_norm": 1.2585346698760986, "kl": 0.20474490523338318, "learning_rate": 4.833951066243004e-06, "loss": 0.0082, "reward": 4.4211530685424805, "reward_std": 3.1804075241088867, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.39615336060523987, "rewards/reward_search_strategy": 0.4000000059604645, "step": 205 }, { "completion_length": 335.5, "epoch": 0.7202797202797203, "grad_norm": 1.0650960206985474, "kl": 0.306348592042923, "learning_rate": 4.830809503038781e-06, "loss": 0.0123, "reward": 7.345704078674316, "reward_std": 1.786327838897705, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6957041621208191, "rewards/reward_search_strategy": 0.6499999761581421, "step": 206 }, { "completion_length": 380.125, "epoch": 0.7237762237762237, "grad_norm": 1.0272008180618286, "kl": 0.24504351615905762, "learning_rate": 4.8276395396563215e-06, "loss": 0.0098, "reward": 3.7895874977111816, "reward_std": 2.8140430450439453, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3645874857902527, "rewards/reward_search_strategy": 0.42500001192092896, "step": 207 }, { "completion_length": 452.75, "epoch": 0.7272727272727273, "grad_norm": 0.7511652112007141, "kl": 0.22063565254211426, "learning_rate": 4.824441214720629e-06, "loss": 0.0088, "reward": 4.4974045753479, "reward_std": 3.246708631515503, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3724047541618347, "rewards/reward_search_strategy": 0.625, "step": 208 }, { "completion_length": 259.5, "epoch": 0.7307692307692307, "grad_norm": 1.161224126815796, "kl": 0.4260641038417816, "learning_rate": 4.821214567202284e-06, "loss": 0.017, "reward": 5.784225940704346, "reward_std": 2.628154993057251, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.40922629833221436, "rewards/reward_search_strategy": 0.375, "step": 209 }, { "completion_length": 429.0, "epoch": 0.7342657342657343, "grad_norm": 1.5333750247955322, "kl": 0.3188992738723755, "learning_rate": 4.817959636416969e-06, "loss": 0.0128, "reward": 5.550121784210205, "reward_std": 3.3491060733795166, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4501221776008606, "rewards/reward_search_strategy": 0.7249999642372131, "step": 210 }, { "completion_length": 526.125, "epoch": 0.7377622377622378, "grad_norm": 0.5344757437705994, "kl": 0.07630512863397598, "learning_rate": 4.814676462024988e-06, "loss": 0.0031, "reward": 4.594638824462891, "reward_std": 2.2206883430480957, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5946387052536011, "rewards/reward_search_strategy": 0.625, "step": 211 }, { "completion_length": 362.75, "epoch": 0.7412587412587412, "grad_norm": 0.9454336762428284, "kl": 0.18136979639530182, "learning_rate": 4.811365084030784e-06, "loss": 0.0073, "reward": 2.3874056339263916, "reward_std": 2.5450692176818848, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4374057650566101, "rewards/reward_search_strategy": 0.44999998807907104, "step": 212 }, { "completion_length": 497.875, "epoch": 0.7447552447552448, "grad_norm": 0.5748926997184753, "kl": 0.0902019739151001, "learning_rate": 4.808025542782453e-06, "loss": 0.0036, "reward": 6.283139228820801, "reward_std": 3.231776237487793, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5081396102905273, "rewards/reward_search_strategy": 0.7749999761581421, "step": 213 }, { "completion_length": 498.375, "epoch": 0.7482517482517482, "grad_norm": 0.6200758218765259, "kl": 0.08524007350206375, "learning_rate": 4.804657878971252e-06, "loss": 0.0034, "reward": 3.962136745452881, "reward_std": 2.2499866485595703, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.3621365427970886, "rewards/reward_search_strategy": 0.22500000894069672, "step": 214 }, { "completion_length": 382.75, "epoch": 0.7517482517482518, "grad_norm": 0.7669521570205688, "kl": 0.10141566395759583, "learning_rate": 4.801262133631101e-06, "loss": 0.0041, "reward": 4.497635841369629, "reward_std": 3.2545547485351562, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6476355791091919, "rewards/reward_search_strategy": 0.4750000238418579, "step": 215 }, { "completion_length": 465.0, "epoch": 0.7552447552447552, "grad_norm": 0.8276023268699646, "kl": 0.13839085400104523, "learning_rate": 4.7978383481380865e-06, "loss": 0.0055, "reward": 6.246551513671875, "reward_std": 2.7982473373413086, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4465520679950714, "rewards/reward_search_strategy": 0.550000011920929, "step": 216 }, { "completion_length": 348.875, "epoch": 0.7587412587412588, "grad_norm": 1.2766557931900024, "kl": 0.1600530445575714, "learning_rate": 4.794386564209953e-06, "loss": 0.0064, "reward": 3.271803855895996, "reward_std": 3.5930278301239014, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.37180379033088684, "rewards/reward_search_strategy": 0.4000000059604645, "step": 217 }, { "completion_length": 374.0, "epoch": 0.7622377622377622, "grad_norm": 0.9958794116973877, "kl": 0.15608493983745575, "learning_rate": 4.790906823905599e-06, "loss": 0.0062, "reward": 5.452359199523926, "reward_std": 2.1846044063568115, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.6023593544960022, "rewards/reward_search_strategy": 0.3500000238418579, "step": 218 }, { "completion_length": 352.625, "epoch": 0.7657342657342657, "grad_norm": 0.7433205842971802, "kl": 0.08911055326461792, "learning_rate": 4.787399169624562e-06, "loss": 0.0036, "reward": 5.496169090270996, "reward_std": 3.098224639892578, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.37116897106170654, "rewards/reward_search_strategy": 0.5, "step": 219 }, { "completion_length": 314.375, "epoch": 0.7692307692307693, "grad_norm": 1.3895384073257446, "kl": 0.14789187908172607, "learning_rate": 4.783863644106502e-06, "loss": 0.0059, "reward": 5.104283332824707, "reward_std": 3.202737808227539, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.42928358912467957, "rewards/reward_search_strategy": 0.550000011920929, "step": 220 }, { "completion_length": 483.0, "epoch": 0.7727272727272727, "grad_norm": 0.9903397560119629, "kl": 0.11351314932107925, "learning_rate": 4.780300290430683e-06, "loss": 0.0045, "reward": 3.8369758129119873, "reward_std": 2.3063299655914307, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4869758188724518, "rewards/reward_search_strategy": 0.6000000238418579, "step": 221 }, { "completion_length": 650.375, "epoch": 0.7762237762237763, "grad_norm": 1.8814777135849, "kl": 0.08025513589382172, "learning_rate": 4.776709152015443e-06, "loss": 0.0032, "reward": 3.4345803260803223, "reward_std": 2.4340741634368896, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.43458035588264465, "rewards/reward_search_strategy": 0.5, "step": 222 }, { "completion_length": 337.625, "epoch": 0.7797202797202797, "grad_norm": 1.796839952468872, "kl": 0.1309894621372223, "learning_rate": 4.773090272617672e-06, "loss": 0.0052, "reward": 4.828001976013184, "reward_std": 3.078373908996582, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.5030019283294678, "rewards/reward_search_strategy": 0.45000001788139343, "step": 223 }, { "completion_length": 257.75, "epoch": 0.7832167832167832, "grad_norm": 1.1037311553955078, "kl": 0.0957639217376709, "learning_rate": 4.769443696332272e-06, "loss": 0.0038, "reward": 1.960368037223816, "reward_std": 2.3306467533111572, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.33536797761917114, "rewards/reward_search_strategy": 0.25, "step": 224 }, { "completion_length": 448.125, "epoch": 0.7867132867132867, "grad_norm": 0.759524405002594, "kl": 0.10344026237726212, "learning_rate": 4.765769467591626e-06, "loss": 0.0041, "reward": 6.857405662536621, "reward_std": 2.5283827781677246, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6074060201644897, "rewards/reward_search_strategy": 0.75, "step": 225 }, { "completion_length": 506.625, "epoch": 0.7902097902097902, "grad_norm": 1.952030062675476, "kl": 0.19945436716079712, "learning_rate": 4.762067631165049e-06, "loss": 0.008, "reward": 2.5767011642456055, "reward_std": 1.8916816711425781, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.5267009735107422, "rewards/reward_search_strategy": 0.42500001192092896, "step": 226 }, { "completion_length": 805.5, "epoch": 0.7937062937062938, "grad_norm": 0.6654692888259888, "kl": 0.057463277131319046, "learning_rate": 4.7583382321582525e-06, "loss": 0.0023, "reward": 2.919649362564087, "reward_std": 2.193359375, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.44464927911758423, "rewards/reward_search_strategy": 0.4750000238418579, "step": 227 }, { "completion_length": 223.75, "epoch": 0.7972027972027972, "grad_norm": 1.6518536806106567, "kl": 0.19654996693134308, "learning_rate": 4.754581316012785e-06, "loss": 0.0079, "reward": 3.6179158687591553, "reward_std": 3.4085731506347656, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.26791587471961975, "rewards/reward_search_strategy": 0.3500000238418579, "step": 228 }, { "completion_length": 362.125, "epoch": 0.8006993006993007, "grad_norm": 1.1536964178085327, "kl": 0.10235659778118134, "learning_rate": 4.750796928505484e-06, "loss": 0.0041, "reward": 3.124476194381714, "reward_std": 2.4904367923736572, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.39947620034217834, "rewards/reward_search_strategy": 0.22500000894069672, "step": 229 }, { "completion_length": 428.25, "epoch": 0.8041958041958042, "grad_norm": 2.578925371170044, "kl": 0.25000184774398804, "learning_rate": 4.746985115747918e-06, "loss": 0.01, "reward": 4.44868278503418, "reward_std": 3.246079444885254, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.2986827492713928, "rewards/reward_search_strategy": 0.2750000059604645, "step": 230 }, { "completion_length": 518.625, "epoch": 0.8076923076923077, "grad_norm": 0.5841361284255981, "kl": 0.061389174312353134, "learning_rate": 4.743145924185821e-06, "loss": 0.0025, "reward": 4.760910511016846, "reward_std": 2.5773563385009766, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6859105229377747, "rewards/reward_search_strategy": 0.5750000476837158, "step": 231 }, { "completion_length": 446.0, "epoch": 0.8111888111888111, "grad_norm": 1.4337817430496216, "kl": 0.08883440494537354, "learning_rate": 4.7392794005985324e-06, "loss": 0.0036, "reward": 5.19779109954834, "reward_std": 2.8020260334014893, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.4727913737297058, "rewards/reward_search_strategy": 0.5999999642372131, "step": 232 }, { "completion_length": 342.0, "epoch": 0.8146853146853147, "grad_norm": 9.370593070983887, "kl": 1.3528209924697876, "learning_rate": 4.735385592098421e-06, "loss": 0.0541, "reward": 3.8530240058898926, "reward_std": 3.4489855766296387, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3780239224433899, "rewards/reward_search_strategy": 0.3500000238418579, "step": 233 }, { "completion_length": 552.375, "epoch": 0.8181818181818182, "grad_norm": 0.5406525731086731, "kl": 0.06817281246185303, "learning_rate": 4.731464546130315e-06, "loss": 0.0027, "reward": 6.135942459106445, "reward_std": 2.168360471725464, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.48594266176223755, "rewards/reward_search_strategy": 0.6500000357627869, "step": 234 }, { "completion_length": 182.125, "epoch": 0.8216783216783217, "grad_norm": 1.1569007635116577, "kl": 0.17475339770317078, "learning_rate": 4.72751631047092e-06, "loss": 0.007, "reward": 3.322808265686035, "reward_std": 4.106693267822266, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.2228086292743683, "rewards/reward_search_strategy": 0.3500000238418579, "step": 235 }, { "completion_length": 576.0, "epoch": 0.8251748251748252, "grad_norm": 0.7848924994468689, "kl": 0.09483487904071808, "learning_rate": 4.723540933228245e-06, "loss": 0.0038, "reward": 5.85687255859375, "reward_std": 3.493412733078003, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5068723559379578, "rewards/reward_search_strategy": 0.5999999642372131, "step": 236 }, { "completion_length": 435.125, "epoch": 0.8286713286713286, "grad_norm": 0.9283254146575928, "kl": 0.09187249094247818, "learning_rate": 4.719538462841003e-06, "loss": 0.0037, "reward": 3.7286319732666016, "reward_std": 1.7196377515792847, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.253632128238678, "rewards/reward_search_strategy": 0.3500000238418579, "step": 237 }, { "completion_length": 228.5, "epoch": 0.8321678321678322, "grad_norm": 1.0967156887054443, "kl": 0.11720684170722961, "learning_rate": 4.715508948078037e-06, "loss": 0.0047, "reward": 3.382150650024414, "reward_std": 3.5990474224090576, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.28215059638023376, "rewards/reward_search_strategy": 0.3500000238418579, "step": 238 }, { "completion_length": 315.0, "epoch": 0.8356643356643356, "grad_norm": 0.8198094367980957, "kl": 0.10767393559217453, "learning_rate": 4.71145243803771e-06, "loss": 0.0043, "reward": 3.447524070739746, "reward_std": 2.665365695953369, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4225241541862488, "rewards/reward_search_strategy": 0.2750000059604645, "step": 239 }, { "completion_length": 439.5, "epoch": 0.8391608391608392, "grad_norm": 0.6438246965408325, "kl": 0.0823763906955719, "learning_rate": 4.707368982147318e-06, "loss": 0.0033, "reward": 4.2677226066589355, "reward_std": 2.0768139362335205, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6177226901054382, "rewards/reward_search_strategy": 0.6499999761581421, "step": 240 }, { "completion_length": 452.625, "epoch": 0.8426573426573427, "grad_norm": 0.8286120295524597, "kl": 0.13944250345230103, "learning_rate": 4.703258630162481e-06, "loss": 0.0056, "reward": 6.840209007263184, "reward_std": 2.0674614906311035, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5402096509933472, "rewards/reward_search_strategy": 0.675000011920929, "step": 241 }, { "completion_length": 425.125, "epoch": 0.8461538461538461, "grad_norm": 0.6866943836212158, "kl": 0.08232449740171432, "learning_rate": 4.699121432166542e-06, "loss": 0.0033, "reward": 4.0894389152526855, "reward_std": 3.227137327194214, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.21443882584571838, "rewards/reward_search_strategy": 0.5, "step": 242 }, { "completion_length": 318.5, "epoch": 0.8496503496503497, "grad_norm": 1.1648399829864502, "kl": 0.19596882164478302, "learning_rate": 4.6949574385699514e-06, "loss": 0.0078, "reward": 3.9066293239593506, "reward_std": 3.0713882446289062, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3316292464733124, "rewards/reward_search_strategy": 0.44999998807907104, "step": 243 }, { "completion_length": 338.0, "epoch": 0.8531468531468531, "grad_norm": 45.41154479980469, "kl": 11.361538887023926, "learning_rate": 4.690766700109659e-06, "loss": 0.4545, "reward": 4.442761421203613, "reward_std": 3.3356175422668457, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.29276183247566223, "rewards/reward_search_strategy": 0.3999999761581421, "step": 244 }, { "completion_length": 272.375, "epoch": 0.8566433566433567, "grad_norm": 0.975391149520874, "kl": 0.0915113240480423, "learning_rate": 4.68654926784849e-06, "loss": 0.0037, "reward": 5.3376922607421875, "reward_std": 2.7289388179779053, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6126920580863953, "rewards/reward_search_strategy": 0.7250000238418579, "step": 245 }, { "completion_length": 328.0, "epoch": 0.8601398601398601, "grad_norm": 0.8504089117050171, "kl": 0.09103550016880035, "learning_rate": 4.682305193174524e-06, "loss": 0.0036, "reward": 6.278281211853027, "reward_std": 3.407573699951172, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5032811164855957, "rewards/reward_search_strategy": 0.6499999761581421, "step": 246 }, { "completion_length": 503.375, "epoch": 0.8636363636363636, "grad_norm": 0.8787162899971008, "kl": 0.08089443296194077, "learning_rate": 4.6780345278004744e-06, "loss": 0.0032, "reward": 5.4692606925964355, "reward_std": 3.3828439712524414, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4442606270313263, "rewards/reward_search_strategy": 0.5249999761581421, "step": 247 }, { "completion_length": 545.25, "epoch": 0.8671328671328671, "grad_norm": 0.5359931588172913, "kl": 0.12629981338977814, "learning_rate": 4.673737323763048e-06, "loss": 0.0051, "reward": 4.020110607147217, "reward_std": 2.231464147567749, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6951106786727905, "rewards/reward_search_strategy": 0.7000000476837158, "step": 248 }, { "completion_length": 406.625, "epoch": 0.8706293706293706, "grad_norm": 5.756573677062988, "kl": 4.332620143890381, "learning_rate": 4.669413633422322e-06, "loss": 0.1733, "reward": 4.411682605743408, "reward_std": 2.8709969520568848, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.4866824746131897, "rewards/reward_search_strategy": 0.675000011920929, "step": 249 }, { "completion_length": 659.625, "epoch": 0.8741258741258742, "grad_norm": 0.5628019571304321, "kl": 0.06190004199743271, "learning_rate": 4.665063509461098e-06, "loss": 0.0025, "reward": 5.015512466430664, "reward_std": 2.039670944213867, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.6405123472213745, "rewards/reward_search_strategy": 0.625, "step": 250 }, { "completion_length": 376.0, "epoch": 0.8776223776223776, "grad_norm": 0.6614730358123779, "kl": 0.10508718341588974, "learning_rate": 4.6606870048842626e-06, "loss": 0.0042, "reward": 4.854657173156738, "reward_std": 3.0948827266693115, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4296573996543884, "rewards/reward_search_strategy": 0.550000011920929, "step": 251 }, { "completion_length": 352.5, "epoch": 0.8811188811188811, "grad_norm": 1.1718766689300537, "kl": 0.10988699644804001, "learning_rate": 4.656284173018144e-06, "loss": 0.0044, "reward": 4.657190322875977, "reward_std": 3.0112640857696533, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.40719035267829895, "rewards/reward_search_strategy": 0.5, "step": 252 }, { "completion_length": 596.875, "epoch": 0.8846153846153846, "grad_norm": 0.6592712998390198, "kl": 0.08809557557106018, "learning_rate": 4.65185506750986e-06, "loss": 0.0035, "reward": 4.057185649871826, "reward_std": 2.201416015625, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.6321854591369629, "rewards/reward_search_strategy": 0.550000011920929, "step": 253 }, { "completion_length": 357.625, "epoch": 0.8881118881118881, "grad_norm": 1.0848559141159058, "kl": 0.17698736488819122, "learning_rate": 4.6473997423266615e-06, "loss": 0.0071, "reward": 2.2025272846221924, "reward_std": 2.108583450317383, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.45252731442451477, "rewards/reward_search_strategy": 0.375, "step": 254 }, { "completion_length": 423.375, "epoch": 0.8916083916083916, "grad_norm": 1.3164327144622803, "kl": 0.20678985118865967, "learning_rate": 4.642918251755281e-06, "loss": 0.0083, "reward": 4.640832901000977, "reward_std": 3.4112045764923096, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5658329129219055, "rewards/reward_search_strategy": 0.44999998807907104, "step": 255 }, { "completion_length": 480.875, "epoch": 0.8951048951048951, "grad_norm": 0.9758729934692383, "kl": 0.12582933902740479, "learning_rate": 4.638410650401267e-06, "loss": 0.005, "reward": 3.9991414546966553, "reward_std": 2.9747655391693115, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4241415858268738, "rewards/reward_search_strategy": 0.45000001788139343, "step": 256 }, { "completion_length": 377.875, "epoch": 0.8986013986013986, "grad_norm": 0.7494639754295349, "kl": 0.12961195409297943, "learning_rate": 4.633876993188319e-06, "loss": 0.0052, "reward": 5.370232582092285, "reward_std": 2.715336322784424, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.29523247480392456, "rewards/reward_search_strategy": 0.574999988079071, "step": 257 }, { "completion_length": 446.375, "epoch": 0.9020979020979021, "grad_norm": 0.9032400250434875, "kl": 0.12136492133140564, "learning_rate": 4.62931733535762e-06, "loss": 0.0049, "reward": 1.9620543718338013, "reward_std": 1.8072302341461182, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.3620542883872986, "rewards/reward_search_strategy": 0.3500000238418579, "step": 258 }, { "completion_length": 461.625, "epoch": 0.9055944055944056, "grad_norm": 0.8652839660644531, "kl": 0.09421462565660477, "learning_rate": 4.62473173246716e-06, "loss": 0.0038, "reward": 4.801279067993164, "reward_std": 3.08664870262146, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5762789845466614, "rewards/reward_search_strategy": 0.6000000238418579, "step": 259 }, { "completion_length": 373.375, "epoch": 0.9090909090909091, "grad_norm": 1.3319971561431885, "kl": 0.19203314185142517, "learning_rate": 4.620120240391065e-06, "loss": 0.0077, "reward": 3.6981348991394043, "reward_std": 2.8933234214782715, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4981347918510437, "rewards/reward_search_strategy": 0.45000001788139343, "step": 260 }, { "completion_length": 567.875, "epoch": 0.9125874125874126, "grad_norm": 0.39829006791114807, "kl": 0.08746462315320969, "learning_rate": 4.6154829153189105e-06, "loss": 0.0035, "reward": 4.194956302642822, "reward_std": 1.963258981704712, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.4949561357498169, "rewards/reward_search_strategy": 0.699999988079071, "step": 261 }, { "completion_length": 493.375, "epoch": 0.916083916083916, "grad_norm": 0.7264673113822937, "kl": 0.09607839584350586, "learning_rate": 4.610819813755038e-06, "loss": 0.0038, "reward": 3.0513057708740234, "reward_std": 1.9612208604812622, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.45130595564842224, "rewards/reward_search_strategy": 0.6000000238418579, "step": 262 }, { "completion_length": 447.25, "epoch": 0.9195804195804196, "grad_norm": 1.1438530683517456, "kl": 0.12347482144832611, "learning_rate": 4.60613099251787e-06, "loss": 0.0049, "reward": 5.763233184814453, "reward_std": 2.4886815547943115, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4132331609725952, "rewards/reward_search_strategy": 0.7250000238418579, "step": 263 }, { "completion_length": 428.375, "epoch": 0.9230769230769231, "grad_norm": 0.8694964051246643, "kl": 0.1385030597448349, "learning_rate": 4.601416508739211e-06, "loss": 0.0055, "reward": 3.2145180702209473, "reward_std": 2.9202535152435303, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3395180106163025, "rewards/reward_search_strategy": 0.625, "step": 264 }, { "completion_length": 381.125, "epoch": 0.9265734265734266, "grad_norm": 1.3948837518692017, "kl": 0.1681864708662033, "learning_rate": 4.596676419863561e-06, "loss": 0.0067, "reward": 6.393776893615723, "reward_std": 3.190995693206787, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.643777072429657, "rewards/reward_search_strategy": 0.75, "step": 265 }, { "completion_length": 394.75, "epoch": 0.9300699300699301, "grad_norm": 0.7370116710662842, "kl": 0.12830950319766998, "learning_rate": 4.591910783647405e-06, "loss": 0.0051, "reward": 5.736630439758301, "reward_std": 3.0993423461914062, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4366300702095032, "rewards/reward_search_strategy": 0.675000011920929, "step": 266 }, { "completion_length": 525.875, "epoch": 0.9335664335664335, "grad_norm": 0.7589148283004761, "kl": 0.11880780756473541, "learning_rate": 4.587119658158517e-06, "loss": 0.0048, "reward": 2.8846940994262695, "reward_std": 1.8504923582077026, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3846941590309143, "rewards/reward_search_strategy": 0.375, "step": 267 }, { "completion_length": 556.625, "epoch": 0.9370629370629371, "grad_norm": 0.7502413392066956, "kl": 0.13426969945430756, "learning_rate": 4.582303101775249e-06, "loss": 0.0054, "reward": 3.693828582763672, "reward_std": 2.5637710094451904, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.41882866621017456, "rewards/reward_search_strategy": 0.5249999761581421, "step": 268 }, { "completion_length": 387.125, "epoch": 0.9405594405594405, "grad_norm": 1.2606431245803833, "kl": 0.15010693669319153, "learning_rate": 4.577461173185821e-06, "loss": 0.006, "reward": 5.423138618469238, "reward_std": 2.955420970916748, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5231384038925171, "rewards/reward_search_strategy": 0.5250000357627869, "step": 269 }, { "completion_length": 544.5, "epoch": 0.9440559440559441, "grad_norm": 117.32878875732422, "kl": 7.282515525817871, "learning_rate": 4.572593931387604e-06, "loss": 0.2913, "reward": 3.634967088699341, "reward_std": 1.6427184343338013, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5599672198295593, "rewards/reward_search_strategy": 0.45000001788139343, "step": 270 }, { "completion_length": 344.625, "epoch": 0.9475524475524476, "grad_norm": 1.2033367156982422, "kl": 0.4850043058395386, "learning_rate": 4.567701435686405e-06, "loss": 0.0194, "reward": 2.6374354362487793, "reward_std": 3.4561848640441895, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.23743543028831482, "rewards/reward_search_strategy": 0.2750000059604645, "step": 271 }, { "completion_length": 450.25, "epoch": 0.951048951048951, "grad_norm": 9.75650691986084, "kl": 4.285886287689209, "learning_rate": 4.562783745695738e-06, "loss": 0.1714, "reward": 2.6909584999084473, "reward_std": 1.9044957160949707, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.490958571434021, "rewards/reward_search_strategy": 0.44999998807907104, "step": 272 }, { "completion_length": 308.375, "epoch": 0.9545454545454546, "grad_norm": 1.5673211812973022, "kl": 0.31804680824279785, "learning_rate": 4.5578409213361055e-06, "loss": 0.0127, "reward": 4.503492832183838, "reward_std": 3.3028485774993896, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5284927487373352, "rewards/reward_search_strategy": 0.4749999940395355, "step": 273 }, { "completion_length": 314.375, "epoch": 0.958041958041958, "grad_norm": 1.0376074314117432, "kl": 0.23754160106182098, "learning_rate": 4.55287302283426e-06, "loss": 0.0095, "reward": 5.486078262329102, "reward_std": 2.5428996086120605, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3610779643058777, "rewards/reward_search_strategy": 0.375, "step": 274 }, { "completion_length": 483.375, "epoch": 0.9615384615384616, "grad_norm": 0.7041484713554382, "kl": 0.15638189017772675, "learning_rate": 4.54788011072248e-06, "loss": 0.0063, "reward": 7.443948745727539, "reward_std": 2.539414882659912, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.7939489483833313, "rewards/reward_search_strategy": 0.8999999761581421, "step": 275 }, { "completion_length": 466.75, "epoch": 0.965034965034965, "grad_norm": 0.618880033493042, "kl": 0.132215678691864, "learning_rate": 4.542862245837821e-06, "loss": 0.0053, "reward": 3.8898563385009766, "reward_std": 2.4422481060028076, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.46485650539398193, "rewards/reward_search_strategy": 0.675000011920929, "step": 276 }, { "completion_length": 380.625, "epoch": 0.9685314685314685, "grad_norm": 2.440429210662842, "kl": 0.22437606751918793, "learning_rate": 4.537819489321385e-06, "loss": 0.009, "reward": 3.1129302978515625, "reward_std": 2.365227460861206, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.36293041706085205, "rewards/reward_search_strategy": 0.5, "step": 277 }, { "completion_length": 574.75, "epoch": 0.972027972027972, "grad_norm": 0.5336410999298096, "kl": 0.10619538277387619, "learning_rate": 4.5327519026175694e-06, "loss": 0.0042, "reward": 5.400971412658691, "reward_std": 2.051018714904785, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5259711742401123, "rewards/reward_search_strategy": 0.625, "step": 278 }, { "completion_length": 434.75, "epoch": 0.9755244755244755, "grad_norm": 0.6269961595535278, "kl": 0.12009799480438232, "learning_rate": 4.527659547473317e-06, "loss": 0.0048, "reward": 6.216640949249268, "reward_std": 2.1789255142211914, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.49164125323295593, "rewards/reward_search_strategy": 0.6000000238418579, "step": 279 }, { "completion_length": 352.75, "epoch": 0.9790209790209791, "grad_norm": 1.62322199344635, "kl": 0.16065241396427155, "learning_rate": 4.522542485937369e-06, "loss": 0.0064, "reward": 6.019310474395752, "reward_std": 3.3751046657562256, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.36931055784225464, "rewards/reward_search_strategy": 0.7749999761581421, "step": 280 }, { "completion_length": 321.25, "epoch": 0.9825174825174825, "grad_norm": 1.4410455226898193, "kl": 0.16992495954036713, "learning_rate": 4.517400780359505e-06, "loss": 0.0068, "reward": 6.315744400024414, "reward_std": 3.2042458057403564, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.515744686126709, "rewards/reward_search_strategy": 0.6749999523162842, "step": 281 }, { "completion_length": 531.75, "epoch": 0.986013986013986, "grad_norm": 0.589606761932373, "kl": 0.09500578045845032, "learning_rate": 4.512234493389785e-06, "loss": 0.0038, "reward": 5.298929214477539, "reward_std": 2.658466339111328, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.6739287376403809, "rewards/reward_search_strategy": 0.625, "step": 282 }, { "completion_length": 408.125, "epoch": 0.9895104895104895, "grad_norm": 2.672626256942749, "kl": 0.25274914503097534, "learning_rate": 4.507043687977787e-06, "loss": 0.0101, "reward": 5.0886993408203125, "reward_std": 2.9135916233062744, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.23869916796684265, "rewards/reward_search_strategy": 0.6000000238418579, "step": 283 }, { "completion_length": 496.5, "epoch": 0.993006993006993, "grad_norm": 0.5412589907646179, "kl": 0.12916360795497894, "learning_rate": 4.501828427371834e-06, "loss": 0.0052, "reward": 3.9245102405548096, "reward_std": 2.256361484527588, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.6245103478431702, "rewards/reward_search_strategy": 0.550000011920929, "step": 284 }, { "completion_length": 400.5, "epoch": 0.9965034965034965, "grad_norm": 0.8593103289604187, "kl": 0.131291002035141, "learning_rate": 4.496588775118232e-06, "loss": 0.0053, "reward": 5.690485954284668, "reward_std": 3.571183204650879, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.46548622846603394, "rewards/reward_search_strategy": 0.6000000238418579, "step": 285 }, { "completion_length": 564.25, "epoch": 1.0, "grad_norm": 0.6153562664985657, "kl": 0.09146779030561447, "learning_rate": 4.491324795060491e-06, "loss": 0.0037, "reward": 6.105203628540039, "reward_std": 2.1778714656829834, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.6802035570144653, "rewards/reward_search_strategy": 0.7999999523162842, "step": 286 }, { "completion_length": 334.25, "epoch": 1.0034965034965035, "grad_norm": 1.5982016324996948, "kl": 0.27556726336479187, "learning_rate": 4.4860365513385456e-06, "loss": 0.011, "reward": 6.495705604553223, "reward_std": 3.424804925918579, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.37070590257644653, "rewards/reward_search_strategy": 0.75, "step": 287 }, { "completion_length": 327.5, "epoch": 1.006993006993007, "grad_norm": 0.9249085187911987, "kl": 0.13310858607292175, "learning_rate": 4.4807241083879774e-06, "loss": 0.0053, "reward": 4.291748523712158, "reward_std": 3.5267059803009033, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4167482554912567, "rewards/reward_search_strategy": 0.625, "step": 288 }, { "completion_length": 353.375, "epoch": 1.0104895104895104, "grad_norm": 1.0354520082473755, "kl": 0.13781802356243134, "learning_rate": 4.475387530939226e-06, "loss": 0.0055, "reward": 6.14892578125, "reward_std": 3.4906580448150635, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5739257335662842, "rewards/reward_search_strategy": 0.699999988079071, "step": 289 }, { "completion_length": 442.0, "epoch": 1.013986013986014, "grad_norm": 0.5650519132614136, "kl": 0.09590885043144226, "learning_rate": 4.470026884016805e-06, "loss": 0.0038, "reward": 6.06260347366333, "reward_std": 2.7577667236328125, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5376033782958984, "rewards/reward_search_strategy": 0.7749999761581421, "step": 290 }, { "completion_length": 442.0, "epoch": 1.0174825174825175, "grad_norm": 0.5313113331794739, "kl": 0.10662511736154556, "learning_rate": 4.464642232938505e-06, "loss": 0.0043, "reward": 5.261750221252441, "reward_std": 2.6503186225891113, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.48675042390823364, "rewards/reward_search_strategy": 0.5249999761581421, "step": 291 }, { "completion_length": 182.5, "epoch": 1.020979020979021, "grad_norm": 1.577793002128601, "kl": 0.33014675974845886, "learning_rate": 4.4592336433146e-06, "loss": 0.0132, "reward": 3.2789976596832275, "reward_std": 2.985043525695801, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.1039976254105568, "rewards/reward_search_strategy": 0.42500001192092896, "step": 292 }, { "completion_length": 331.125, "epoch": 1.0244755244755244, "grad_norm": 0.7017413973808289, "kl": 0.1622174084186554, "learning_rate": 4.453801181047047e-06, "loss": 0.0065, "reward": 7.63859224319458, "reward_std": 1.5380568504333496, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.6885923147201538, "rewards/reward_search_strategy": 0.7000000476837158, "step": 293 }, { "completion_length": 438.125, "epoch": 1.027972027972028, "grad_norm": 2.848461866378784, "kl": 0.12165224552154541, "learning_rate": 4.448344912328686e-06, "loss": 0.0049, "reward": 3.944343090057373, "reward_std": 2.368406057357788, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.2943430542945862, "rewards/reward_search_strategy": 0.6500000357627869, "step": 294 }, { "completion_length": 472.125, "epoch": 1.0314685314685315, "grad_norm": 1.3413972854614258, "kl": 0.1494167149066925, "learning_rate": 4.442864903642428e-06, "loss": 0.006, "reward": 4.690557956695557, "reward_std": 2.62575626373291, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.49055802822113037, "rewards/reward_search_strategy": 0.574999988079071, "step": 295 }, { "completion_length": 493.125, "epoch": 1.034965034965035, "grad_norm": 1.2033652067184448, "kl": 0.17120806872844696, "learning_rate": 4.437361221760449e-06, "loss": 0.0068, "reward": 5.665086269378662, "reward_std": 3.6448543071746826, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.46508604288101196, "rewards/reward_search_strategy": 0.574999988079071, "step": 296 }, { "completion_length": 594.125, "epoch": 1.0384615384615385, "grad_norm": 0.5311799049377441, "kl": 0.07367473095655441, "learning_rate": 4.431833933743378e-06, "loss": 0.0029, "reward": 4.792994499206543, "reward_std": 1.6828709840774536, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5179944634437561, "rewards/reward_search_strategy": 0.5249999761581421, "step": 297 }, { "completion_length": 528.75, "epoch": 1.0419580419580419, "grad_norm": 0.5492209196090698, "kl": 0.13790830969810486, "learning_rate": 4.426283106939474e-06, "loss": 0.0055, "reward": 5.346704959869385, "reward_std": 2.976729393005371, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6967048645019531, "rewards/reward_search_strategy": 0.6500000357627869, "step": 298 }, { "completion_length": 352.0, "epoch": 1.0454545454545454, "grad_norm": 6.345902919769287, "kl": 8.095924377441406, "learning_rate": 4.420708808983809e-06, "loss": 0.3238, "reward": 6.351449966430664, "reward_std": 3.353184938430786, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5514500141143799, "rewards/reward_search_strategy": 0.675000011920929, "step": 299 }, { "completion_length": 431.25, "epoch": 1.048951048951049, "grad_norm": 0.7204697132110596, "kl": 0.13538429141044617, "learning_rate": 4.415111107797445e-06, "loss": 0.0054, "reward": 5.675325393676758, "reward_std": 1.948087453842163, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.6003252267837524, "rewards/reward_search_strategy": 0.5750000476837158, "step": 300 }, { "completion_length": 329.0, "epoch": 1.0524475524475525, "grad_norm": 0.9927306175231934, "kl": 0.12998303771018982, "learning_rate": 4.409490071586606e-06, "loss": 0.0052, "reward": 4.4069647789001465, "reward_std": 3.4473555088043213, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.2819647789001465, "rewards/reward_search_strategy": 0.625, "step": 301 }, { "completion_length": 307.75, "epoch": 1.055944055944056, "grad_norm": 1.6803146600723267, "kl": 0.1856156587600708, "learning_rate": 4.403845768841842e-06, "loss": 0.0074, "reward": 4.0473809242248535, "reward_std": 3.1361262798309326, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3223809003829956, "rewards/reward_search_strategy": 0.4750000238418579, "step": 302 }, { "completion_length": 539.0, "epoch": 1.0594405594405594, "grad_norm": 0.5144261717796326, "kl": 0.09425308555364609, "learning_rate": 4.398178268337202e-06, "loss": 0.0038, "reward": 5.513421535491943, "reward_std": 2.786677837371826, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5884215831756592, "rewards/reward_search_strategy": 0.550000011920929, "step": 303 }, { "completion_length": 483.625, "epoch": 1.062937062937063, "grad_norm": 0.6803086400032043, "kl": 0.09594997018575668, "learning_rate": 4.3924876391293915e-06, "loss": 0.0038, "reward": 5.4244794845581055, "reward_std": 2.6390738487243652, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5994799733161926, "rewards/reward_search_strategy": 0.574999988079071, "step": 304 }, { "completion_length": 559.25, "epoch": 1.0664335664335665, "grad_norm": 0.5547813773155212, "kl": 0.1060553565621376, "learning_rate": 4.386773950556931e-06, "loss": 0.0042, "reward": 5.821169853210449, "reward_std": 2.7936391830444336, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.7211699485778809, "rewards/reward_search_strategy": 0.7249999642372131, "step": 305 }, { "completion_length": 580.375, "epoch": 1.06993006993007, "grad_norm": 0.5855110883712769, "kl": 0.14783963561058044, "learning_rate": 4.381037272239311e-06, "loss": 0.0059, "reward": 6.188364028930664, "reward_std": 2.4008665084838867, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5883639454841614, "rewards/reward_search_strategy": 0.6000000238418579, "step": 306 }, { "completion_length": 447.125, "epoch": 1.0734265734265733, "grad_norm": 1.4545259475708008, "kl": 0.2002858966588974, "learning_rate": 4.3752776740761495e-06, "loss": 0.008, "reward": 5.376095771789551, "reward_std": 3.676356315612793, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.576095700263977, "rewards/reward_search_strategy": 0.675000011920929, "step": 307 }, { "completion_length": 539.5, "epoch": 1.0769230769230769, "grad_norm": 11.574174880981445, "kl": 2.8931026458740234, "learning_rate": 4.36949522624633e-06, "loss": 0.1157, "reward": 4.711579322814941, "reward_std": 2.978994131088257, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.7115795612335205, "rewards/reward_search_strategy": 0.625, "step": 308 }, { "completion_length": 297.625, "epoch": 1.0804195804195804, "grad_norm": 2.2476742267608643, "kl": 0.18139171600341797, "learning_rate": 4.3636899992071555e-06, "loss": 0.0073, "reward": 7.508697986602783, "reward_std": 3.040250062942505, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5586981773376465, "rewards/reward_search_strategy": 0.824999988079071, "step": 309 }, { "completion_length": 361.625, "epoch": 1.083916083916084, "grad_norm": 0.5807898640632629, "kl": 0.139273002743721, "learning_rate": 4.357862063693486e-06, "loss": 0.0056, "reward": 6.986447334289551, "reward_std": 2.1157562732696533, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5114474296569824, "rewards/reward_search_strategy": 0.7250000238418579, "step": 310 }, { "completion_length": 567.875, "epoch": 1.0874125874125875, "grad_norm": 0.5602300763130188, "kl": 0.1023474633693695, "learning_rate": 4.352011490716875e-06, "loss": 0.0041, "reward": 5.648248672485352, "reward_std": 2.6205856800079346, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5982489585876465, "rewards/reward_search_strategy": 0.675000011920929, "step": 311 }, { "completion_length": 401.5, "epoch": 1.0909090909090908, "grad_norm": 0.6656911969184875, "kl": 0.10482494533061981, "learning_rate": 4.346138351564711e-06, "loss": 0.0042, "reward": 5.354951858520508, "reward_std": 2.9195470809936523, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.6049516201019287, "rewards/reward_search_strategy": 0.75, "step": 312 }, { "completion_length": 490.5, "epoch": 1.0944055944055944, "grad_norm": 0.6362817287445068, "kl": 0.07986684143543243, "learning_rate": 4.340242717799337e-06, "loss": 0.0032, "reward": 5.589838981628418, "reward_std": 2.4936347007751465, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.41483867168426514, "rewards/reward_search_strategy": 0.675000011920929, "step": 313 }, { "completion_length": 395.25, "epoch": 1.097902097902098, "grad_norm": 1.8932520151138306, "kl": 0.1983058750629425, "learning_rate": 4.334324661257191e-06, "loss": 0.0079, "reward": 4.26251220703125, "reward_std": 3.2981317043304443, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.43751221895217896, "rewards/reward_search_strategy": 0.5750000476837158, "step": 314 }, { "completion_length": 454.0, "epoch": 1.1013986013986015, "grad_norm": 0.5324206352233887, "kl": 0.08979379385709763, "learning_rate": 4.328384254047927e-06, "loss": 0.0036, "reward": 7.345250129699707, "reward_std": 1.8361765146255493, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3702499270439148, "rewards/reward_search_strategy": 0.7250000238418579, "step": 315 }, { "completion_length": 269.625, "epoch": 1.104895104895105, "grad_norm": 1.358657717704773, "kl": 0.16482065618038177, "learning_rate": 4.322421568553529e-06, "loss": 0.0066, "reward": 5.7101945877075195, "reward_std": 2.8660097122192383, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3851943910121918, "rewards/reward_search_strategy": 0.574999988079071, "step": 316 }, { "completion_length": 426.0, "epoch": 1.1083916083916083, "grad_norm": 0.8824137449264526, "kl": 0.20691236853599548, "learning_rate": 4.316436677427441e-06, "loss": 0.0083, "reward": 5.294474124908447, "reward_std": 2.200099229812622, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5194741487503052, "rewards/reward_search_strategy": 0.6499999761581421, "step": 317 }, { "completion_length": 449.5, "epoch": 1.1118881118881119, "grad_norm": 0.6172366738319397, "kl": 0.07651077210903168, "learning_rate": 4.3104296535936695e-06, "loss": 0.0031, "reward": 4.2370381355285645, "reward_std": 2.9659953117370605, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4870380759239197, "rewards/reward_search_strategy": 0.5, "step": 318 }, { "completion_length": 535.75, "epoch": 1.1153846153846154, "grad_norm": 0.6779645085334778, "kl": 0.08398789167404175, "learning_rate": 4.3044005702459055e-06, "loss": 0.0034, "reward": 5.004274845123291, "reward_std": 2.3385448455810547, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.4542747437953949, "rewards/reward_search_strategy": 0.5500000715255737, "step": 319 }, { "completion_length": 401.25, "epoch": 1.118881118881119, "grad_norm": 2.2599902153015137, "kl": 0.16237607598304749, "learning_rate": 4.2983495008466285e-06, "loss": 0.0065, "reward": 5.627570629119873, "reward_std": 3.1766750812530518, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3525705933570862, "rewards/reward_search_strategy": 0.7749999761581421, "step": 320 }, { "completion_length": 345.0, "epoch": 1.1223776223776223, "grad_norm": 0.6874108910560608, "kl": 0.07024817913770676, "learning_rate": 4.2922765191262075e-06, "loss": 0.0028, "reward": 6.876392841339111, "reward_std": 2.1439974308013916, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5263932347297668, "rewards/reward_search_strategy": 0.8499999642372131, "step": 321 }, { "completion_length": 640.375, "epoch": 1.1258741258741258, "grad_norm": 0.5233185887336731, "kl": 0.05953974276781082, "learning_rate": 4.286181699082008e-06, "loss": 0.0024, "reward": 5.468829154968262, "reward_std": 2.4537880420684814, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.44382914900779724, "rewards/reward_search_strategy": 0.7749999761581421, "step": 322 }, { "completion_length": 425.5, "epoch": 1.1293706293706294, "grad_norm": 0.5920670628547668, "kl": 0.06298626214265823, "learning_rate": 4.280065114977492e-06, "loss": 0.0025, "reward": 2.663602590560913, "reward_std": 2.2890970706939697, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.5136026740074158, "rewards/reward_search_strategy": 0.6500000357627869, "step": 323 }, { "completion_length": 406.875, "epoch": 1.132867132867133, "grad_norm": 0.6604630947113037, "kl": 0.08841750025749207, "learning_rate": 4.273926841341303e-06, "loss": 0.0035, "reward": 5.190492153167725, "reward_std": 2.3081531524658203, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5654920339584351, "rewards/reward_search_strategy": 0.75, "step": 324 }, { "completion_length": 367.375, "epoch": 1.1363636363636362, "grad_norm": 6.282087802886963, "kl": 3.9899542331695557, "learning_rate": 4.267766952966369e-06, "loss": 0.1596, "reward": 3.679385185241699, "reward_std": 3.0263350009918213, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.45438525080680847, "rewards/reward_search_strategy": 0.4749999940395355, "step": 325 }, { "completion_length": 544.25, "epoch": 1.1398601398601398, "grad_norm": 1.237210988998413, "kl": 0.1231246143579483, "learning_rate": 4.261585524908987e-06, "loss": 0.0049, "reward": 3.501136064529419, "reward_std": 2.881526231765747, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4511359930038452, "rewards/reward_search_strategy": 0.675000011920929, "step": 326 }, { "completion_length": 564.0, "epoch": 1.1433566433566433, "grad_norm": 0.504810094833374, "kl": 0.09063772112131119, "learning_rate": 4.255382632487907e-06, "loss": 0.0036, "reward": 3.910247325897217, "reward_std": 1.80642831325531, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5602472424507141, "rewards/reward_search_strategy": 0.4750000238418579, "step": 327 }, { "completion_length": 374.5, "epoch": 1.1468531468531469, "grad_norm": 0.7342034578323364, "kl": 0.08345510065555573, "learning_rate": 4.249158351283414e-06, "loss": 0.0033, "reward": 5.706239700317383, "reward_std": 3.2305665016174316, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.53123939037323, "rewards/reward_search_strategy": 0.675000011920929, "step": 328 }, { "completion_length": 527.25, "epoch": 1.1503496503496504, "grad_norm": 0.46101129055023193, "kl": 0.07814895361661911, "learning_rate": 4.242912757136412e-06, "loss": 0.0031, "reward": 3.4161453247070312, "reward_std": 2.4508216381073, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5411453247070312, "rewards/reward_search_strategy": 0.5, "step": 329 }, { "completion_length": 363.875, "epoch": 1.1538461538461537, "grad_norm": 0.8629273176193237, "kl": 0.1023842915892601, "learning_rate": 4.236645926147493e-06, "loss": 0.0041, "reward": 5.935527324676514, "reward_std": 2.9570200443267822, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5355273485183716, "rewards/reward_search_strategy": 0.7749999761581421, "step": 330 }, { "completion_length": 523.25, "epoch": 1.1573426573426573, "grad_norm": 0.4914082884788513, "kl": 0.07481839507818222, "learning_rate": 4.230357934676017e-06, "loss": 0.003, "reward": 5.845967769622803, "reward_std": 2.551992177963257, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.495967835187912, "rewards/reward_search_strategy": 0.7249999642372131, "step": 331 }, { "completion_length": 382.75, "epoch": 1.1608391608391608, "grad_norm": 0.541487455368042, "kl": 0.11320384591817856, "learning_rate": 4.224048859339175e-06, "loss": 0.0045, "reward": 6.013635635375977, "reward_std": 2.582735300064087, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3386358618736267, "rewards/reward_search_strategy": 0.675000011920929, "step": 332 }, { "completion_length": 728.125, "epoch": 1.1643356643356644, "grad_norm": 0.42518407106399536, "kl": 0.047705646604299545, "learning_rate": 4.217718777011058e-06, "loss": 0.0019, "reward": 3.8004541397094727, "reward_std": 2.1054131984710693, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.35045403242111206, "rewards/reward_search_strategy": 0.699999988079071, "step": 333 }, { "completion_length": 310.375, "epoch": 1.167832167832168, "grad_norm": 1.1916086673736572, "kl": 0.15560080111026764, "learning_rate": 4.211367764821722e-06, "loss": 0.0062, "reward": 5.576718330383301, "reward_std": 3.0815911293029785, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.30171847343444824, "rewards/reward_search_strategy": 0.6500000357627869, "step": 334 }, { "completion_length": 221.25, "epoch": 1.1713286713286712, "grad_norm": 9.67128849029541, "kl": 0.7930659651756287, "learning_rate": 4.204995900156247e-06, "loss": 0.0317, "reward": 6.268731117248535, "reward_std": 3.307931423187256, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3937305808067322, "rewards/reward_search_strategy": 0.625, "step": 335 }, { "completion_length": 369.75, "epoch": 1.1748251748251748, "grad_norm": 1.1554738283157349, "kl": 0.13195408880710602, "learning_rate": 4.198603260653792e-06, "loss": 0.0053, "reward": 4.597299575805664, "reward_std": 2.33955979347229, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4972996115684509, "rewards/reward_search_strategy": 0.4750000238418579, "step": 336 }, { "completion_length": 411.25, "epoch": 1.1783216783216783, "grad_norm": 0.7392211556434631, "kl": 0.0733492374420166, "learning_rate": 4.192189924206652e-06, "loss": 0.0029, "reward": 4.321833610534668, "reward_std": 2.759119987487793, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.521833598613739, "rewards/reward_search_strategy": 0.800000011920929, "step": 337 }, { "completion_length": 451.875, "epoch": 1.1818181818181819, "grad_norm": 1.921710729598999, "kl": 0.21552717685699463, "learning_rate": 4.185755968959308e-06, "loss": 0.0086, "reward": 4.766526222229004, "reward_std": 2.58585524559021, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5915263891220093, "rewards/reward_search_strategy": 0.675000011920929, "step": 338 }, { "completion_length": 332.25, "epoch": 1.1853146853146854, "grad_norm": 1.0764071941375732, "kl": 0.11709711700677872, "learning_rate": 4.179301473307476e-06, "loss": 0.0047, "reward": 5.342533111572266, "reward_std": 3.2703137397766113, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5675327777862549, "rewards/reward_search_strategy": 0.7749999761581421, "step": 339 }, { "completion_length": 496.5, "epoch": 1.1888111888111887, "grad_norm": 0.7248229384422302, "kl": 0.13478195667266846, "learning_rate": 4.172826515897146e-06, "loss": 0.0054, "reward": 6.131617546081543, "reward_std": 2.5291075706481934, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4816179871559143, "rewards/reward_search_strategy": 0.6500000357627869, "step": 340 }, { "completion_length": 392.75, "epoch": 1.1923076923076923, "grad_norm": 0.6313228011131287, "kl": 0.08677749335765839, "learning_rate": 4.166331175623631e-06, "loss": 0.0035, "reward": 5.503582000732422, "reward_std": 2.0927414894104004, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5785820484161377, "rewards/reward_search_strategy": 0.7999999523162842, "step": 341 }, { "completion_length": 358.75, "epoch": 1.1958041958041958, "grad_norm": 0.5460852384567261, "kl": 0.13055555522441864, "learning_rate": 4.159815531630604e-06, "loss": 0.0052, "reward": 7.100890159606934, "reward_std": 2.1578822135925293, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.4008905589580536, "rewards/reward_search_strategy": 0.699999988079071, "step": 342 }, { "completion_length": 228.375, "epoch": 1.1993006993006994, "grad_norm": 0.8770285248756409, "kl": 0.1582285761833191, "learning_rate": 4.15327966330913e-06, "loss": 0.0063, "reward": 6.754122734069824, "reward_std": 3.0570099353790283, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5791229009628296, "rewards/reward_search_strategy": 0.675000011920929, "step": 343 }, { "completion_length": 397.75, "epoch": 1.2027972027972027, "grad_norm": 0.9353595972061157, "kl": 0.21055281162261963, "learning_rate": 4.146723650296701e-06, "loss": 0.0084, "reward": 5.746088981628418, "reward_std": 2.988830327987671, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4960886836051941, "rewards/reward_search_strategy": 0.75, "step": 344 }, { "completion_length": 376.5, "epoch": 1.2062937062937062, "grad_norm": 0.7754123210906982, "kl": 0.10437698662281036, "learning_rate": 4.140147572476269e-06, "loss": 0.0042, "reward": 3.666969060897827, "reward_std": 4.023746490478516, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.2669692635536194, "rewards/reward_search_strategy": 0.4000000059604645, "step": 345 }, { "completion_length": 435.625, "epoch": 1.2097902097902098, "grad_norm": 0.9024556875228882, "kl": 0.09636207669973373, "learning_rate": 4.133551509975264e-06, "loss": 0.0039, "reward": 4.0536298751831055, "reward_std": 3.3120667934417725, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5036299824714661, "rewards/reward_search_strategy": 0.550000011920929, "step": 346 }, { "completion_length": 234.375, "epoch": 1.2132867132867133, "grad_norm": 0.8678302764892578, "kl": 0.1372126191854477, "learning_rate": 4.126935543164628e-06, "loss": 0.0055, "reward": 5.475516319274902, "reward_std": 3.4662699699401855, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.45051607489585876, "rewards/reward_search_strategy": 0.6499999761581421, "step": 347 }, { "completion_length": 268.5, "epoch": 1.2167832167832167, "grad_norm": 1.028032660484314, "kl": 0.21305130422115326, "learning_rate": 4.120299752657828e-06, "loss": 0.0085, "reward": 5.076361656188965, "reward_std": 4.117757797241211, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4513613283634186, "rewards/reward_search_strategy": 0.625, "step": 348 }, { "completion_length": 510.375, "epoch": 1.2202797202797202, "grad_norm": 0.9544126391410828, "kl": 0.09230495244264603, "learning_rate": 4.113644219309877e-06, "loss": 0.0037, "reward": 6.58498477935791, "reward_std": 2.8294339179992676, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.6849843263626099, "rewards/reward_search_strategy": 0.6499999761581421, "step": 349 }, { "completion_length": 312.875, "epoch": 1.2237762237762237, "grad_norm": 2.1660711765289307, "kl": 0.22974209487438202, "learning_rate": 4.106969024216348e-06, "loss": 0.0092, "reward": 6.715004920959473, "reward_std": 2.784714937210083, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.49000489711761475, "rewards/reward_search_strategy": 0.7250000238418579, "step": 350 }, { "completion_length": 348.25, "epoch": 1.2272727272727273, "grad_norm": 0.9564335346221924, "kl": 0.20949901640415192, "learning_rate": 4.1002742487123896e-06, "loss": 0.0084, "reward": 4.565638542175293, "reward_std": 2.672559976577759, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.39063847064971924, "rewards/reward_search_strategy": 0.675000011920929, "step": 351 }, { "completion_length": 479.5, "epoch": 1.2307692307692308, "grad_norm": 0.6322876214981079, "kl": 0.07695091515779495, "learning_rate": 4.093559974371725e-06, "loss": 0.0031, "reward": 6.6676716804504395, "reward_std": 2.6538872718811035, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5926718711853027, "rewards/reward_search_strategy": 0.8250000476837158, "step": 352 }, { "completion_length": 494.75, "epoch": 1.2342657342657342, "grad_norm": 0.4909517168998718, "kl": 0.07885551452636719, "learning_rate": 4.086826283005669e-06, "loss": 0.0032, "reward": 6.1619181632995605, "reward_std": 2.8411340713500977, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3369182050228119, "rewards/reward_search_strategy": 0.699999988079071, "step": 353 }, { "completion_length": 260.25, "epoch": 1.2377622377622377, "grad_norm": 1.253501057624817, "kl": 0.20247994363307953, "learning_rate": 4.080073256662128e-06, "loss": 0.0081, "reward": 5.574455261230469, "reward_std": 3.5395596027374268, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3494548499584198, "rewards/reward_search_strategy": 0.6000000238418579, "step": 354 }, { "completion_length": 309.125, "epoch": 1.2412587412587412, "grad_norm": 1.7126333713531494, "kl": 0.21196775138378143, "learning_rate": 4.073300977624594e-06, "loss": 0.0085, "reward": 7.307598114013672, "reward_std": 2.9741148948669434, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5575979948043823, "rewards/reward_search_strategy": 0.625, "step": 355 }, { "completion_length": 405.0, "epoch": 1.2447552447552448, "grad_norm": 1.082837462425232, "kl": 0.07227756828069687, "learning_rate": 4.066509528411151e-06, "loss": 0.0029, "reward": 5.161370277404785, "reward_std": 3.016519546508789, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.43637019395828247, "rewards/reward_search_strategy": 0.7249999642372131, "step": 356 }, { "completion_length": 376.25, "epoch": 1.2482517482517483, "grad_norm": 1.099038004875183, "kl": 0.13722370564937592, "learning_rate": 4.059698991773466e-06, "loss": 0.0055, "reward": 3.3820579051971436, "reward_std": 2.766157388687134, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.40705791115760803, "rewards/reward_search_strategy": 0.4749999940395355, "step": 357 }, { "completion_length": 532.625, "epoch": 1.2517482517482517, "grad_norm": 0.6121649146080017, "kl": 0.05823325365781784, "learning_rate": 4.052869450695776e-06, "loss": 0.0023, "reward": 4.155327796936035, "reward_std": 3.0830624103546143, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.530327558517456, "rewards/reward_search_strategy": 0.5, "step": 358 }, { "completion_length": 534.875, "epoch": 1.2552447552447552, "grad_norm": 0.5585082173347473, "kl": 0.07693375647068024, "learning_rate": 4.046020988393886e-06, "loss": 0.0031, "reward": 4.022989273071289, "reward_std": 1.8042207956314087, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3729895353317261, "rewards/reward_search_strategy": 0.6500000357627869, "step": 359 }, { "completion_length": 404.125, "epoch": 1.2587412587412588, "grad_norm": 0.5998912453651428, "kl": 0.08919445425271988, "learning_rate": 4.039153688314146e-06, "loss": 0.0036, "reward": 6.726578712463379, "reward_std": 2.5917811393737793, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.35157859325408936, "rewards/reward_search_strategy": 0.875, "step": 360 }, { "completion_length": 406.5, "epoch": 1.2622377622377623, "grad_norm": 1.4461448192596436, "kl": 0.12945714592933655, "learning_rate": 4.032267634132442e-06, "loss": 0.0052, "reward": 6.075223922729492, "reward_std": 3.8267555236816406, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.47522372007369995, "rewards/reward_search_strategy": 0.6000000238418579, "step": 361 }, { "completion_length": 490.25, "epoch": 1.2657342657342658, "grad_norm": 0.5157952308654785, "kl": 0.08816548436880112, "learning_rate": 4.02536290975317e-06, "loss": 0.0035, "reward": 7.19053316116333, "reward_std": 2.843262195587158, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.640533447265625, "rewards/reward_search_strategy": 0.800000011920929, "step": 362 }, { "completion_length": 320.375, "epoch": 1.2692307692307692, "grad_norm": 11.697229385375977, "kl": 0.3807832598686218, "learning_rate": 4.018439599308217e-06, "loss": 0.0152, "reward": 6.845804214477539, "reward_std": 3.1374518871307373, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.42080387473106384, "rewards/reward_search_strategy": 0.800000011920929, "step": 363 }, { "completion_length": 307.75, "epoch": 1.2727272727272727, "grad_norm": 0.8195920586585999, "kl": 0.2704997956752777, "learning_rate": 4.011497787155938e-06, "loss": 0.0108, "reward": 5.673530101776123, "reward_std": 2.617912769317627, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.34852996468544006, "rewards/reward_search_strategy": 0.44999998807907104, "step": 364 }, { "completion_length": 284.0, "epoch": 1.2762237762237763, "grad_norm": 1.018688678741455, "kl": 0.14098995923995972, "learning_rate": 4.0045375578801216e-06, "loss": 0.0056, "reward": 5.075797080993652, "reward_std": 3.588682174682617, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3757967948913574, "rewards/reward_search_strategy": 0.44999998807907104, "step": 365 }, { "completion_length": 364.875, "epoch": 1.2797202797202798, "grad_norm": 0.9165819883346558, "kl": 0.14407169818878174, "learning_rate": 3.997558996288965e-06, "loss": 0.0058, "reward": 5.143270492553711, "reward_std": 3.304353713989258, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5432703495025635, "rewards/reward_search_strategy": 0.4750000238418579, "step": 366 }, { "completion_length": 225.375, "epoch": 1.2832167832167833, "grad_norm": 1.225709319114685, "kl": 0.20389658212661743, "learning_rate": 3.9905621874140396e-06, "loss": 0.0082, "reward": 5.480903625488281, "reward_std": 3.1632080078125, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.455904096364975, "rewards/reward_search_strategy": 0.40000003576278687, "step": 367 }, { "completion_length": 338.875, "epoch": 1.2867132867132867, "grad_norm": 1.0578041076660156, "kl": 0.16953974962234497, "learning_rate": 3.983547216509254e-06, "loss": 0.0068, "reward": 5.733012676239014, "reward_std": 3.0423076152801514, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3580129146575928, "rewards/reward_search_strategy": 0.625, "step": 368 }, { "completion_length": 402.375, "epoch": 1.2902097902097902, "grad_norm": 1.284543752670288, "kl": 0.14602556824684143, "learning_rate": 3.976514169049814e-06, "loss": 0.0058, "reward": 6.073895454406738, "reward_std": 3.7657344341278076, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.42389529943466187, "rewards/reward_search_strategy": 0.6499999761581421, "step": 369 }, { "completion_length": 418.75, "epoch": 1.2937062937062938, "grad_norm": 1.140341877937317, "kl": 0.24927854537963867, "learning_rate": 3.969463130731183e-06, "loss": 0.01, "reward": 3.9747872352600098, "reward_std": 2.9689948558807373, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5747873783111572, "rewards/reward_search_strategy": 0.7749999761581421, "step": 370 }, { "completion_length": 278.25, "epoch": 1.297202797202797, "grad_norm": 1.2817490100860596, "kl": 0.3547288179397583, "learning_rate": 3.96239418746804e-06, "loss": 0.0142, "reward": 4.307415008544922, "reward_std": 2.03360915184021, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.4324150085449219, "rewards/reward_search_strategy": 0.625, "step": 371 }, { "completion_length": 383.5, "epoch": 1.3006993006993006, "grad_norm": 0.7459798455238342, "kl": 0.17955949902534485, "learning_rate": 3.955307425393224e-06, "loss": 0.0072, "reward": 5.501554489135742, "reward_std": 3.2705891132354736, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5265547633171082, "rewards/reward_search_strategy": 0.7249999642372131, "step": 372 }, { "completion_length": 309.125, "epoch": 1.3041958041958042, "grad_norm": 1.1568890810012817, "kl": 0.16535663604736328, "learning_rate": 3.948202930856697e-06, "loss": 0.0066, "reward": 6.784092903137207, "reward_std": 3.0337963104248047, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.48409315943717957, "rewards/reward_search_strategy": 0.7999999523162842, "step": 373 }, { "completion_length": 295.375, "epoch": 1.3076923076923077, "grad_norm": 0.7384740710258484, "kl": 0.215895876288414, "learning_rate": 3.941080790424483e-06, "loss": 0.0086, "reward": 7.188152313232422, "reward_std": 2.466623067855835, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.6631525754928589, "rewards/reward_search_strategy": 0.7749999761581421, "step": 374 }, { "completion_length": 472.25, "epoch": 1.3111888111888113, "grad_norm": 0.5587518215179443, "kl": 0.11894085258245468, "learning_rate": 3.933941090877615e-06, "loss": 0.0048, "reward": 5.4161481857299805, "reward_std": 2.650923728942871, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5661482810974121, "rewards/reward_search_strategy": 0.8499999642372131, "step": 375 }, { "completion_length": 246.75, "epoch": 1.3146853146853146, "grad_norm": 0.9946545362472534, "kl": 0.1861283779144287, "learning_rate": 3.92678391921108e-06, "loss": 0.0074, "reward": 6.748879432678223, "reward_std": 3.111686944961548, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3488798141479492, "rewards/reward_search_strategy": 0.7749999761581421, "step": 376 }, { "completion_length": 281.75, "epoch": 1.3181818181818181, "grad_norm": 11.089983940124512, "kl": 4.762326240539551, "learning_rate": 3.9196093626327535e-06, "loss": 0.1905, "reward": 4.622366905212402, "reward_std": 2.756638288497925, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5723665952682495, "rewards/reward_search_strategy": 0.800000011920929, "step": 377 }, { "completion_length": 346.75, "epoch": 1.3216783216783217, "grad_norm": 0.6730005741119385, "kl": 0.1391105055809021, "learning_rate": 3.912417508562345e-06, "loss": 0.0056, "reward": 4.503476142883301, "reward_std": 2.2465193271636963, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.6034764647483826, "rewards/reward_search_strategy": 0.6500000357627869, "step": 378 }, { "completion_length": 473.125, "epoch": 1.3251748251748252, "grad_norm": 0.8451654314994812, "kl": 0.12520407140254974, "learning_rate": 3.905208444630326e-06, "loss": 0.005, "reward": 4.186497688293457, "reward_std": 3.0242393016815186, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5614977478981018, "rewards/reward_search_strategy": 0.5, "step": 379 }, { "completion_length": 246.25, "epoch": 1.3286713286713288, "grad_norm": 0.9249701499938965, "kl": 0.2778664827346802, "learning_rate": 3.897982258676867e-06, "loss": 0.0111, "reward": 5.40871000289917, "reward_std": 3.004138231277466, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5337100028991699, "rewards/reward_search_strategy": 0.625, "step": 380 }, { "completion_length": 502.75, "epoch": 1.332167832167832, "grad_norm": 0.46744218468666077, "kl": 0.09883429110050201, "learning_rate": 3.890739038750763e-06, "loss": 0.004, "reward": 4.416123390197754, "reward_std": 1.8906164169311523, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.6411235332489014, "rewards/reward_search_strategy": 0.7749999761581421, "step": 381 }, { "completion_length": 249.625, "epoch": 1.3356643356643356, "grad_norm": 0.8567023873329163, "kl": 0.16580483317375183, "learning_rate": 3.88347887310836e-06, "loss": 0.0066, "reward": 7.747579097747803, "reward_std": 1.4630359411239624, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5475790500640869, "rewards/reward_search_strategy": 0.824999988079071, "step": 382 }, { "completion_length": 283.5, "epoch": 1.3391608391608392, "grad_norm": 1.3123666048049927, "kl": 0.2037430703639984, "learning_rate": 3.876201850212489e-06, "loss": 0.0081, "reward": 6.500353813171387, "reward_std": 3.7146830558776855, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.40035414695739746, "rewards/reward_search_strategy": 0.7250000238418579, "step": 383 }, { "completion_length": 519.625, "epoch": 1.3426573426573427, "grad_norm": 1.0492886304855347, "kl": 0.09502134472131729, "learning_rate": 3.868908058731376e-06, "loss": 0.0038, "reward": 4.339066505432129, "reward_std": 2.7583582401275635, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3890661597251892, "rewards/reward_search_strategy": 0.574999988079071, "step": 384 }, { "completion_length": 439.5, "epoch": 1.3461538461538463, "grad_norm": 0.824394166469574, "kl": 0.09405747056007385, "learning_rate": 3.861597587537568e-06, "loss": 0.0038, "reward": 3.9054057598114014, "reward_std": 2.7032172679901123, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.2804058790206909, "rewards/reward_search_strategy": 0.625, "step": 385 }, { "completion_length": 357.25, "epoch": 1.3496503496503496, "grad_norm": 0.8456993103027344, "kl": 0.1805035024881363, "learning_rate": 3.85427052570685e-06, "loss": 0.0072, "reward": 6.4848246574401855, "reward_std": 2.7532103061676025, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5098246932029724, "rewards/reward_search_strategy": 0.7249999642372131, "step": 386 }, { "completion_length": 282.25, "epoch": 1.3531468531468531, "grad_norm": 1.6480878591537476, "kl": 0.17681656777858734, "learning_rate": 3.846926962517158e-06, "loss": 0.0071, "reward": 6.323261737823486, "reward_std": 2.6390492916107178, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.4482613801956177, "rewards/reward_search_strategy": 0.6250000596046448, "step": 387 }, { "completion_length": 237.125, "epoch": 1.3566433566433567, "grad_norm": 3.1040449142456055, "kl": 0.48438748717308044, "learning_rate": 3.839566987447492e-06, "loss": 0.0194, "reward": 6.36116361618042, "reward_std": 2.4046599864959717, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3111635446548462, "rewards/reward_search_strategy": 0.30000001192092896, "step": 388 }, { "completion_length": 350.5, "epoch": 1.3601398601398602, "grad_norm": 1.3969569206237793, "kl": 0.14033286273479462, "learning_rate": 3.832190690176825e-06, "loss": 0.0056, "reward": 2.188044786453247, "reward_std": 1.4462171792984009, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.16304486989974976, "rewards/reward_search_strategy": 0.4000000059604645, "step": 389 }, { "completion_length": 326.125, "epoch": 1.3636363636363638, "grad_norm": 1.3200170993804932, "kl": 0.15739156305789948, "learning_rate": 3.824798160583012e-06, "loss": 0.0063, "reward": 4.655465126037598, "reward_std": 2.911059856414795, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4304652214050293, "rewards/reward_search_strategy": 0.6000000238418579, "step": 390 }, { "completion_length": 618.625, "epoch": 1.367132867132867, "grad_norm": 0.6044898629188538, "kl": 0.10862935334444046, "learning_rate": 3.817389488741694e-06, "loss": 0.0043, "reward": 5.85195255279541, "reward_std": 2.695142984390259, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5269524455070496, "rewards/reward_search_strategy": 0.824999988079071, "step": 391 }, { "completion_length": 388.125, "epoch": 1.3706293706293706, "grad_norm": 0.757124662399292, "kl": 0.11755659431219101, "learning_rate": 3.8099647649251984e-06, "loss": 0.0047, "reward": 4.8429670333862305, "reward_std": 2.5176022052764893, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.39296701550483704, "rewards/reward_search_strategy": 0.8250000476837158, "step": 392 }, { "completion_length": 383.0, "epoch": 1.3741258741258742, "grad_norm": 0.6554877758026123, "kl": 0.15485447645187378, "learning_rate": 3.802524079601442e-06, "loss": 0.0062, "reward": 3.9751625061035156, "reward_std": 2.614086151123047, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4751623272895813, "rewards/reward_search_strategy": 0.625, "step": 393 }, { "completion_length": 368.125, "epoch": 1.3776223776223775, "grad_norm": 0.6346847414970398, "kl": 0.11959197372198105, "learning_rate": 3.795067523432826e-06, "loss": 0.0048, "reward": 4.5723676681518555, "reward_std": 2.505882978439331, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.49736788868904114, "rewards/reward_search_strategy": 0.7000000476837158, "step": 394 }, { "completion_length": 249.875, "epoch": 1.381118881118881, "grad_norm": 0.7225646376609802, "kl": 0.14031687378883362, "learning_rate": 3.787595187275136e-06, "loss": 0.0056, "reward": 7.360940933227539, "reward_std": 1.6116032600402832, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.23594066500663757, "rewards/reward_search_strategy": 0.625, "step": 395 }, { "completion_length": 401.875, "epoch": 1.3846153846153846, "grad_norm": 0.7882583141326904, "kl": 0.1797572374343872, "learning_rate": 3.780107162176429e-06, "loss": 0.0072, "reward": 3.243696689605713, "reward_std": 2.612200975418091, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.5436968803405762, "rewards/reward_search_strategy": 0.574999988079071, "step": 396 }, { "completion_length": 294.25, "epoch": 1.3881118881118881, "grad_norm": 3.533095598220825, "kl": 1.1287881135940552, "learning_rate": 3.772603539375929e-06, "loss": 0.0452, "reward": 2.366114616394043, "reward_std": 1.650844693183899, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.4661146402359009, "rewards/reward_search_strategy": 0.40000003576278687, "step": 397 }, { "completion_length": 413.75, "epoch": 1.3916083916083917, "grad_norm": 0.672588586807251, "kl": 0.12809911370277405, "learning_rate": 3.7650844103029093e-06, "loss": 0.0051, "reward": 4.5074543952941895, "reward_std": 1.824966549873352, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.6074544191360474, "rewards/reward_search_strategy": 0.6499999761581421, "step": 398 }, { "completion_length": 327.375, "epoch": 1.395104895104895, "grad_norm": 1.236580729484558, "kl": 0.19380441308021545, "learning_rate": 3.7575498665755884e-06, "loss": 0.0078, "reward": 5.028186321258545, "reward_std": 3.838682174682617, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.45318639278411865, "rewards/reward_search_strategy": 0.699999988079071, "step": 399 }, { "completion_length": 342.375, "epoch": 1.3986013986013985, "grad_norm": 0.9333245754241943, "kl": 0.20195768773555756, "learning_rate": 3.7500000000000005e-06, "loss": 0.0081, "reward": 3.8275704383850098, "reward_std": 3.027420997619629, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.502570390701294, "rewards/reward_search_strategy": 0.45000001788139343, "step": 400 }, { "completion_length": 289.625, "epoch": 1.402097902097902, "grad_norm": 0.7431657314300537, "kl": 0.1940464824438095, "learning_rate": 3.742434902568889e-06, "loss": 0.0078, "reward": 6.76303768157959, "reward_std": 1.9291017055511475, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.6130377054214478, "rewards/reward_search_strategy": 0.6499999761581421, "step": 401 }, { "completion_length": 236.625, "epoch": 1.4055944055944056, "grad_norm": 1.0157201290130615, "kl": 0.28472885489463806, "learning_rate": 3.7348546664605777e-06, "loss": 0.0114, "reward": 4.573916435241699, "reward_std": 3.343165636062622, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3239161968231201, "rewards/reward_search_strategy": 0.375, "step": 402 }, { "completion_length": 370.75, "epoch": 1.4090909090909092, "grad_norm": 0.6442353129386902, "kl": 0.13476325571537018, "learning_rate": 3.7272593840378526e-06, "loss": 0.0054, "reward": 5.87241268157959, "reward_std": 2.7663140296936035, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.32241225242614746, "rewards/reward_search_strategy": 0.800000011920929, "step": 403 }, { "completion_length": 344.875, "epoch": 1.4125874125874125, "grad_norm": 0.9788044691085815, "kl": 0.16146501898765564, "learning_rate": 3.7196491478468322e-06, "loss": 0.0065, "reward": 6.843417167663574, "reward_std": 2.5316994190216064, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.44341692328453064, "rewards/reward_search_strategy": 0.7749999761581421, "step": 404 }, { "completion_length": 300.125, "epoch": 1.416083916083916, "grad_norm": 0.9080623984336853, "kl": 0.20929095149040222, "learning_rate": 3.7120240506158433e-06, "loss": 0.0084, "reward": 6.924595832824707, "reward_std": 2.5679879188537598, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.29959601163864136, "rewards/reward_search_strategy": 0.625, "step": 405 }, { "completion_length": 347.875, "epoch": 1.4195804195804196, "grad_norm": 0.9922822713851929, "kl": 0.16099223494529724, "learning_rate": 3.7043841852542884e-06, "loss": 0.0064, "reward": 5.190982818603516, "reward_std": 3.008113145828247, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.41598278284072876, "rewards/reward_search_strategy": 0.5249999761581421, "step": 406 }, { "completion_length": 414.25, "epoch": 1.4230769230769231, "grad_norm": 0.8010644912719727, "kl": 0.12256429344415665, "learning_rate": 3.6967296448515176e-06, "loss": 0.0049, "reward": 5.230372428894043, "reward_std": 3.308673143386841, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5303722620010376, "rewards/reward_search_strategy": 0.5750000476837158, "step": 407 }, { "completion_length": 248.0, "epoch": 1.4265734265734267, "grad_norm": 1.6469920873641968, "kl": 0.2045002579689026, "learning_rate": 3.689060522675689e-06, "loss": 0.0082, "reward": 6.9636945724487305, "reward_std": 2.935652017593384, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.36369478702545166, "rewards/reward_search_strategy": 0.7250000238418579, "step": 408 }, { "completion_length": 426.5, "epoch": 1.43006993006993, "grad_norm": 0.7976499199867249, "kl": 0.1475878804922104, "learning_rate": 3.6813769121726356e-06, "loss": 0.0059, "reward": 3.9527182579040527, "reward_std": 2.7393836975097656, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.40271830558776855, "rewards/reward_search_strategy": 0.42500001192092896, "step": 409 }, { "completion_length": 384.125, "epoch": 1.4335664335664335, "grad_norm": 1.68340265750885, "kl": 0.2281363308429718, "learning_rate": 3.6736789069647273e-06, "loss": 0.0091, "reward": 4.657155990600586, "reward_std": 3.039395332336426, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.48215600848197937, "rewards/reward_search_strategy": 0.550000011920929, "step": 410 }, { "completion_length": 394.125, "epoch": 1.437062937062937, "grad_norm": 0.4666031002998352, "kl": 0.1385246068239212, "learning_rate": 3.6659666008497287e-06, "loss": 0.0055, "reward": 3.309140205383301, "reward_std": 1.4426900148391724, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.409140020608902, "rewards/reward_search_strategy": 0.6499999761581421, "step": 411 }, { "completion_length": 232.375, "epoch": 1.4405594405594406, "grad_norm": 1.0850403308868408, "kl": 0.23393404483795166, "learning_rate": 3.658240087799655e-06, "loss": 0.0094, "reward": 5.597710609436035, "reward_std": 2.9239015579223633, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3727110028266907, "rewards/reward_search_strategy": 0.5999999642372131, "step": 412 }, { "completion_length": 329.875, "epoch": 1.4440559440559442, "grad_norm": 1.1676043272018433, "kl": 0.21017572283744812, "learning_rate": 3.6504994619596295e-06, "loss": 0.0084, "reward": 7.767879486083984, "reward_std": 1.3231239318847656, "rewards/reward_correctness": 1.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.49287882447242737, "rewards/reward_search_strategy": 0.6499999761581421, "step": 413 }, { "completion_length": 286.875, "epoch": 1.4475524475524475, "grad_norm": 1.0990068912506104, "kl": 0.21051634848117828, "learning_rate": 3.642744817646736e-06, "loss": 0.0084, "reward": 4.044559001922607, "reward_std": 3.1820321083068848, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.294559121131897, "rewards/reward_search_strategy": 0.375, "step": 414 }, { "completion_length": 320.75, "epoch": 1.451048951048951, "grad_norm": 1.5767216682434082, "kl": 0.18334715068340302, "learning_rate": 3.634976249348867e-06, "loss": 0.0073, "reward": 5.833085536956787, "reward_std": 3.1178810596466064, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.48308563232421875, "rewards/reward_search_strategy": 0.7249999642372131, "step": 415 }, { "completion_length": 369.125, "epoch": 1.4545454545454546, "grad_norm": 1.140123963356018, "kl": 0.19512750208377838, "learning_rate": 3.627193851723577e-06, "loss": 0.0078, "reward": 2.1661739349365234, "reward_std": 1.7770532369613647, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.44117406010627747, "rewards/reward_search_strategy": 0.3500000238418579, "step": 416 }, { "completion_length": 224.75, "epoch": 1.458041958041958, "grad_norm": 0.892484724521637, "kl": 0.2562803626060486, "learning_rate": 3.6193977195969243e-06, "loss": 0.0103, "reward": 3.8352110385894775, "reward_std": 3.6014862060546875, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.2602110207080841, "rewards/reward_search_strategy": 0.32500001788139343, "step": 417 }, { "completion_length": 275.0, "epoch": 1.4615384615384617, "grad_norm": 1.5628401041030884, "kl": 0.2238333523273468, "learning_rate": 3.611587947962319e-06, "loss": 0.009, "reward": 5.637851715087891, "reward_std": 3.907928466796875, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.537851870059967, "rewards/reward_search_strategy": 0.4749999940395355, "step": 418 }, { "completion_length": 542.375, "epoch": 1.465034965034965, "grad_norm": 1.147643804550171, "kl": 0.17975068092346191, "learning_rate": 3.6037646319793635e-06, "loss": 0.0072, "reward": 2.3722047805786133, "reward_std": 1.6781915426254272, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4472048282623291, "rewards/reward_search_strategy": 0.550000011920929, "step": 419 }, { "completion_length": 384.125, "epoch": 1.4685314685314685, "grad_norm": 1.0817137956619263, "kl": 0.20022520422935486, "learning_rate": 3.595927866972694e-06, "loss": 0.008, "reward": 3.1727235317230225, "reward_std": 2.7517311573028564, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5477235317230225, "rewards/reward_search_strategy": 0.5, "step": 420 }, { "completion_length": 336.875, "epoch": 1.472027972027972, "grad_norm": 0.7116675972938538, "kl": 0.1551167070865631, "learning_rate": 3.5880777484308193e-06, "loss": 0.0062, "reward": 5.459067344665527, "reward_std": 2.214207649230957, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.2840671241283417, "rewards/reward_search_strategy": 0.925000011920929, "step": 421 }, { "completion_length": 598.25, "epoch": 1.4755244755244754, "grad_norm": 0.4839644432067871, "kl": 0.2022503912448883, "learning_rate": 3.5802143720049565e-06, "loss": 0.0081, "reward": 7.548956394195557, "reward_std": 2.6123108863830566, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5989563465118408, "rewards/reward_search_strategy": 0.824999988079071, "step": 422 }, { "completion_length": 448.75, "epoch": 1.479020979020979, "grad_norm": 0.6794417500495911, "kl": 0.18201248347759247, "learning_rate": 3.5723378335078653e-06, "loss": 0.0073, "reward": 6.285902500152588, "reward_std": 2.544133424758911, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5859024524688721, "rewards/reward_search_strategy": 0.824999988079071, "step": 423 }, { "completion_length": 424.125, "epoch": 1.4825174825174825, "grad_norm": 0.7091171741485596, "kl": 0.14801469445228577, "learning_rate": 3.564448228912682e-06, "loss": 0.0059, "reward": 3.324188232421875, "reward_std": 2.133406400680542, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6241883039474487, "rewards/reward_search_strategy": 0.574999988079071, "step": 424 }, { "completion_length": 335.625, "epoch": 1.486013986013986, "grad_norm": 1.1051809787750244, "kl": 0.1744394153356552, "learning_rate": 3.556545654351749e-06, "loss": 0.007, "reward": 3.0275347232818604, "reward_std": 2.2897677421569824, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5275347232818604, "rewards/reward_search_strategy": 0.625, "step": 425 }, { "completion_length": 313.5, "epoch": 1.4895104895104896, "grad_norm": 1.1032415628433228, "kl": 0.29376858472824097, "learning_rate": 3.5486302061154433e-06, "loss": 0.0118, "reward": 4.647106170654297, "reward_std": 3.4931628704071045, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4721059799194336, "rewards/reward_search_strategy": 0.675000011920929, "step": 426 }, { "completion_length": 305.625, "epoch": 1.493006993006993, "grad_norm": 0.8230642080307007, "kl": 0.16636496782302856, "learning_rate": 3.5407019806510035e-06, "loss": 0.0067, "reward": 6.975151062011719, "reward_std": 1.4824897050857544, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.525151252746582, "rewards/reward_search_strategy": 0.824999988079071, "step": 427 }, { "completion_length": 428.5, "epoch": 1.4965034965034965, "grad_norm": 0.6625391244888306, "kl": 0.11103808134794235, "learning_rate": 3.532761074561355e-06, "loss": 0.0044, "reward": 2.761242389678955, "reward_std": 1.572056770324707, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.43624237179756165, "rewards/reward_search_strategy": 0.44999998807907104, "step": 428 }, { "completion_length": 370.25, "epoch": 1.5, "grad_norm": 1.137886643409729, "kl": 0.2061176896095276, "learning_rate": 3.524807584603932e-06, "loss": 0.0082, "reward": 5.234549522399902, "reward_std": 3.020899534225464, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.434549480676651, "rewards/reward_search_strategy": 0.675000011920929, "step": 429 }, { "completion_length": 460.875, "epoch": 1.5034965034965035, "grad_norm": 0.6655659675598145, "kl": 0.13574932515621185, "learning_rate": 3.516841607689501e-06, "loss": 0.0054, "reward": 4.962242126464844, "reward_std": 2.2505006790161133, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.6372420787811279, "rewards/reward_search_strategy": 0.574999988079071, "step": 430 }, { "completion_length": 449.75, "epoch": 1.506993006993007, "grad_norm": 1.6040316820144653, "kl": 0.2962697148323059, "learning_rate": 3.5088632408809757e-06, "loss": 0.0119, "reward": 2.212186574935913, "reward_std": 1.8170539140701294, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.312186598777771, "rewards/reward_search_strategy": 0.4000000059604645, "step": 431 }, { "completion_length": 239.75, "epoch": 1.5104895104895104, "grad_norm": 2.0863418579101562, "kl": 0.42183053493499756, "learning_rate": 3.5008725813922383e-06, "loss": 0.0169, "reward": 5.592342376708984, "reward_std": 3.7416441440582275, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.292342871427536, "rewards/reward_search_strategy": 0.550000011920929, "step": 432 }, { "completion_length": 215.875, "epoch": 1.513986013986014, "grad_norm": 1.3532419204711914, "kl": 0.34601929783821106, "learning_rate": 3.4928697265869516e-06, "loss": 0.0138, "reward": 2.0051121711730957, "reward_std": 1.8395854234695435, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.30511218309402466, "rewards/reward_search_strategy": 0.32499998807907104, "step": 433 }, { "completion_length": 289.625, "epoch": 1.5174825174825175, "grad_norm": 2.330354928970337, "kl": 0.4995352625846863, "learning_rate": 3.4848547739773782e-06, "loss": 0.02, "reward": 2.3652281761169434, "reward_std": 1.205407977104187, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4152282774448395, "rewards/reward_search_strategy": 0.44999998807907104, "step": 434 }, { "completion_length": 480.0, "epoch": 1.5209790209790208, "grad_norm": 0.8131763935089111, "kl": 0.21656283736228943, "learning_rate": 3.476827821223184e-06, "loss": 0.0087, "reward": 3.1841320991516113, "reward_std": 2.568476438522339, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5091319680213928, "rewards/reward_search_strategy": 0.550000011920929, "step": 435 }, { "completion_length": 393.25, "epoch": 1.5244755244755246, "grad_norm": 0.9795050621032715, "kl": 0.23742111027240753, "learning_rate": 3.4687889661302577e-06, "loss": 0.0095, "reward": 5.273265361785889, "reward_std": 3.0285797119140625, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5982652306556702, "rewards/reward_search_strategy": 0.675000011920929, "step": 436 }, { "completion_length": 477.375, "epoch": 1.527972027972028, "grad_norm": 1.0040581226348877, "kl": 0.1645866483449936, "learning_rate": 3.460738306649509e-06, "loss": 0.0066, "reward": 3.343968152999878, "reward_std": 1.9579272270202637, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.26896828413009644, "rewards/reward_search_strategy": 0.699999988079071, "step": 437 }, { "completion_length": 418.375, "epoch": 1.5314685314685315, "grad_norm": 0.6381561160087585, "kl": 0.1781652718782425, "learning_rate": 3.452675940875686e-06, "loss": 0.0071, "reward": 4.517590522766113, "reward_std": 1.709311842918396, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.6175904273986816, "rewards/reward_search_strategy": 0.5249999761581421, "step": 438 }, { "completion_length": 324.625, "epoch": 1.534965034965035, "grad_norm": 1.2970616817474365, "kl": 0.2414904087781906, "learning_rate": 3.4446019670461684e-06, "loss": 0.0097, "reward": 3.321227788925171, "reward_std": 1.2823264598846436, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.47122782468795776, "rewards/reward_search_strategy": 0.4750000238418579, "step": 439 }, { "completion_length": 398.25, "epoch": 1.5384615384615383, "grad_norm": 1.4675123691558838, "kl": 0.30049189925193787, "learning_rate": 3.436516483539781e-06, "loss": 0.012, "reward": 3.52902889251709, "reward_std": 2.5919923782348633, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.47902899980545044, "rewards/reward_search_strategy": 0.550000011920929, "step": 440 }, { "completion_length": 424.375, "epoch": 1.541958041958042, "grad_norm": 2.437140941619873, "kl": 0.4226469099521637, "learning_rate": 3.4284195888755877e-06, "loss": 0.0169, "reward": 4.794424057006836, "reward_std": 3.2916970252990723, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5194237232208252, "rewards/reward_search_strategy": 0.6499999761581421, "step": 441 }, { "completion_length": 520.125, "epoch": 1.5454545454545454, "grad_norm": 0.6724218130111694, "kl": 0.23738737404346466, "learning_rate": 3.4203113817116955e-06, "loss": 0.0095, "reward": 6.539656639099121, "reward_std": 2.3404319286346436, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.4896567165851593, "rewards/reward_search_strategy": 0.550000011920929, "step": 442 }, { "completion_length": 475.625, "epoch": 1.548951048951049, "grad_norm": 0.793103039264679, "kl": 0.2378067821264267, "learning_rate": 3.412191960844049e-06, "loss": 0.0095, "reward": 3.998465061187744, "reward_std": 2.821582794189453, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5234651565551758, "rewards/reward_search_strategy": 0.6000000238418579, "step": 443 }, { "completion_length": 488.25, "epoch": 1.5524475524475525, "grad_norm": 0.6211789846420288, "kl": 0.16191114485263824, "learning_rate": 3.4040614252052305e-06, "loss": 0.0065, "reward": 4.963681221008301, "reward_std": 2.8725788593292236, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5636811256408691, "rewards/reward_search_strategy": 0.6499999761581421, "step": 444 }, { "completion_length": 266.25, "epoch": 1.5559440559440558, "grad_norm": 1.6396141052246094, "kl": 0.42981529235839844, "learning_rate": 3.39591987386325e-06, "loss": 0.0172, "reward": 3.585543155670166, "reward_std": 2.793571710586548, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5105429887771606, "rewards/reward_search_strategy": 0.5750000476837158, "step": 445 }, { "completion_length": 511.125, "epoch": 1.5594405594405596, "grad_norm": 0.6755155920982361, "kl": 0.1948806196451187, "learning_rate": 3.387767406020343e-06, "loss": 0.0078, "reward": 6.462329864501953, "reward_std": 2.490957021713257, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3623296320438385, "rewards/reward_search_strategy": 0.6000000238418579, "step": 446 }, { "completion_length": 309.0, "epoch": 1.562937062937063, "grad_norm": 0.9001058340072632, "kl": 0.23154594004154205, "learning_rate": 3.3796041210117545e-06, "loss": 0.0093, "reward": 5.0776519775390625, "reward_std": 2.2008752822875977, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5276516675949097, "rewards/reward_search_strategy": 0.550000011920929, "step": 447 }, { "completion_length": 395.75, "epoch": 1.5664335664335665, "grad_norm": 0.96751868724823, "kl": 0.24724078178405762, "learning_rate": 3.3714301183045382e-06, "loss": 0.0099, "reward": 5.352873802185059, "reward_std": 3.4447824954986572, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4778738021850586, "rewards/reward_search_strategy": 0.5, "step": 448 }, { "completion_length": 596.5, "epoch": 1.56993006993007, "grad_norm": 0.5340610146522522, "kl": 0.11072459816932678, "learning_rate": 3.3632454974963368e-06, "loss": 0.0044, "reward": 4.271090507507324, "reward_std": 0.11953616142272949, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.6210907101631165, "rewards/reward_search_strategy": 0.6499999761581421, "step": 449 }, { "completion_length": 577.375, "epoch": 1.5734265734265733, "grad_norm": 0.7681328058242798, "kl": 0.1562780886888504, "learning_rate": 3.3550503583141726e-06, "loss": 0.0063, "reward": 3.9707911014556885, "reward_std": 2.6017041206359863, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3957912027835846, "rewards/reward_search_strategy": 0.5750000476837158, "step": 450 }, { "completion_length": 354.125, "epoch": 1.5769230769230769, "grad_norm": 0.7792837023735046, "kl": 0.16238777339458466, "learning_rate": 3.346844800613229e-06, "loss": 0.0065, "reward": 5.095845699310303, "reward_std": 2.282075881958008, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5708456039428711, "rewards/reward_search_strategy": 0.6499999761581421, "step": 451 }, { "completion_length": 327.5, "epoch": 1.5804195804195804, "grad_norm": 6.922494888305664, "kl": 2.356689929962158, "learning_rate": 3.338628924375638e-06, "loss": 0.0943, "reward": 4.468880653381348, "reward_std": 3.3904285430908203, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.543880820274353, "rewards/reward_search_strategy": 0.675000011920929, "step": 452 }, { "completion_length": 207.0, "epoch": 1.583916083916084, "grad_norm": 1.4627685546875, "kl": 0.6507488489151001, "learning_rate": 3.3304028297092583e-06, "loss": 0.026, "reward": 7.305324554443359, "reward_std": 1.5086084604263306, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.280324250459671, "rewards/reward_search_strategy": 0.6500000357627869, "step": 453 }, { "completion_length": 614.5, "epoch": 1.5874125874125875, "grad_norm": 1.2405743598937988, "kl": 0.33275243639945984, "learning_rate": 3.3221666168464584e-06, "loss": 0.0133, "reward": 2.3712778091430664, "reward_std": 2.258321762084961, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.4462779760360718, "rewards/reward_search_strategy": 0.30000001192092896, "step": 454 }, { "completion_length": 568.625, "epoch": 1.5909090909090908, "grad_norm": 0.5220686793327332, "kl": 0.11742983013391495, "learning_rate": 3.313920386142892e-06, "loss": 0.0047, "reward": 3.185084819793701, "reward_std": 2.038414478302002, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4850848317146301, "rewards/reward_search_strategy": 0.699999988079071, "step": 455 }, { "completion_length": 622.125, "epoch": 1.5944055944055944, "grad_norm": 0.7013587355613708, "kl": 0.12143175303936005, "learning_rate": 3.3056642380762783e-06, "loss": 0.0049, "reward": 3.234304904937744, "reward_std": 2.801846742630005, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.5843049883842468, "rewards/reward_search_strategy": 0.4000000059604645, "step": 456 }, { "completion_length": 509.5, "epoch": 1.597902097902098, "grad_norm": 0.5290786027908325, "kl": 0.09905970096588135, "learning_rate": 3.2973982732451753e-06, "loss": 0.004, "reward": 4.47747278213501, "reward_std": 2.0540218353271484, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5774728655815125, "rewards/reward_search_strategy": 0.5249999761581421, "step": 457 }, { "completion_length": 490.125, "epoch": 1.6013986013986012, "grad_norm": 0.5015708804130554, "kl": 0.14589592814445496, "learning_rate": 3.2891225923677565e-06, "loss": 0.0058, "reward": 5.786191463470459, "reward_std": 2.496694326400757, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.43619152903556824, "rewards/reward_search_strategy": 0.8500000238418579, "step": 458 }, { "completion_length": 367.875, "epoch": 1.604895104895105, "grad_norm": 0.6818746328353882, "kl": 0.19857680797576904, "learning_rate": 3.280837296280582e-06, "loss": 0.0079, "reward": 5.759618759155273, "reward_std": 2.648275136947632, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.4596188962459564, "rewards/reward_search_strategy": 0.550000011920929, "step": 459 }, { "completion_length": 492.125, "epoch": 1.6083916083916083, "grad_norm": 0.7230591177940369, "kl": 0.17880992591381073, "learning_rate": 3.272542485937369e-06, "loss": 0.0072, "reward": 5.427669525146484, "reward_std": 2.917447328567505, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.7776695489883423, "rewards/reward_search_strategy": 0.7749999761581421, "step": 460 }, { "completion_length": 462.5, "epoch": 1.6118881118881119, "grad_norm": 0.903525173664093, "kl": 0.15019932389259338, "learning_rate": 3.2642382624077647e-06, "loss": 0.006, "reward": 3.903940439224243, "reward_std": 1.8853068351745605, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5539405345916748, "rewards/reward_search_strategy": 0.6000000238418579, "step": 461 }, { "completion_length": 389.125, "epoch": 1.6153846153846154, "grad_norm": 0.7114129662513733, "kl": 0.17341738939285278, "learning_rate": 3.2559247268761117e-06, "loss": 0.0069, "reward": 2.86030912399292, "reward_std": 1.3243414163589478, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.385309100151062, "rewards/reward_search_strategy": 0.4749999940395355, "step": 462 }, { "completion_length": 497.5, "epoch": 1.6188811188811187, "grad_norm": 1.1611593961715698, "kl": 0.1519286334514618, "learning_rate": 3.247601980640217e-06, "loss": 0.0061, "reward": 5.231886386871338, "reward_std": 2.9188578128814697, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4568866789340973, "rewards/reward_search_strategy": 0.6499999761581421, "step": 463 }, { "completion_length": 307.75, "epoch": 1.6223776223776225, "grad_norm": 1.214368462562561, "kl": 0.1455107182264328, "learning_rate": 3.2392701251101172e-06, "loss": 0.0058, "reward": 1.7257366180419922, "reward_std": 1.2823498249053955, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.42573660612106323, "rewards/reward_search_strategy": 0.30000001192092896, "step": 464 }, { "completion_length": 219.25, "epoch": 1.6258741258741258, "grad_norm": 1.7830798625946045, "kl": 0.24480244517326355, "learning_rate": 3.230929261806842e-06, "loss": 0.0098, "reward": 3.739067554473877, "reward_std": 3.551278829574585, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.38906753063201904, "rewards/reward_search_strategy": 0.3499999940395355, "step": 465 }, { "completion_length": 354.25, "epoch": 1.6293706293706294, "grad_norm": 0.9129117131233215, "kl": 0.17476214468479156, "learning_rate": 3.222579492361179e-06, "loss": 0.007, "reward": 3.277841567993164, "reward_std": 1.013387680053711, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.40284180641174316, "rewards/reward_search_strategy": 0.5, "step": 466 }, { "completion_length": 374.75, "epoch": 1.632867132867133, "grad_norm": 1.2731070518493652, "kl": 0.21385666728019714, "learning_rate": 3.214220918512434e-06, "loss": 0.0086, "reward": 4.312999725341797, "reward_std": 3.5308971405029297, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.23799952864646912, "rewards/reward_search_strategy": 0.44999998807907104, "step": 467 }, { "completion_length": 760.25, "epoch": 1.6363636363636362, "grad_norm": 1.427370309829712, "kl": 0.2925739288330078, "learning_rate": 3.205853642107192e-06, "loss": 0.0117, "reward": 5.048447132110596, "reward_std": 3.4149582386016846, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.47344690561294556, "rewards/reward_search_strategy": 0.574999988079071, "step": 468 }, { "completion_length": 659.875, "epoch": 1.63986013986014, "grad_norm": 0.9429287314414978, "kl": 0.11566608399152756, "learning_rate": 3.1974777650980737e-06, "loss": 0.0046, "reward": 3.4420485496520996, "reward_std": 2.3602750301361084, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.742048442363739, "rewards/reward_search_strategy": 0.44999998807907104, "step": 469 }, { "completion_length": 639.75, "epoch": 1.6433566433566433, "grad_norm": 2.0382609367370605, "kl": 0.14500349760055542, "learning_rate": 3.189093389542498e-06, "loss": 0.0058, "reward": 4.069566249847412, "reward_std": 2.456049919128418, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.54456627368927, "rewards/reward_search_strategy": 0.6499999761581421, "step": 470 }, { "completion_length": 443.0, "epoch": 1.6468531468531469, "grad_norm": 0.672690749168396, "kl": 0.12307699769735336, "learning_rate": 3.180700617601436e-06, "loss": 0.0049, "reward": 5.947089195251465, "reward_std": 2.471454620361328, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.4720890522003174, "rewards/reward_search_strategy": 0.4750000238418579, "step": 471 }, { "completion_length": 404.625, "epoch": 1.6503496503496504, "grad_norm": 0.76925128698349, "kl": 0.1666235327720642, "learning_rate": 3.1722995515381644e-06, "loss": 0.0067, "reward": 5.3943705558776855, "reward_std": 2.020536184310913, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5193708539009094, "rewards/reward_search_strategy": 0.625, "step": 472 }, { "completion_length": 538.0, "epoch": 1.6538461538461537, "grad_norm": 1.2525761127471924, "kl": 0.16103272140026093, "learning_rate": 3.1638902937170224e-06, "loss": 0.0064, "reward": 3.551636219024658, "reward_std": 2.179047107696533, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5516363978385925, "rewards/reward_search_strategy": 0.5, "step": 473 }, { "completion_length": 832.75, "epoch": 1.6573426573426573, "grad_norm": 0.3906629979610443, "kl": 0.08530028909444809, "learning_rate": 3.155472946602162e-06, "loss": 0.0034, "reward": 2.7689270973205566, "reward_std": 1.2982851266860962, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5689270496368408, "rewards/reward_search_strategy": 0.5750000476837158, "step": 474 }, { "completion_length": 530.25, "epoch": 1.6608391608391608, "grad_norm": 0.6616779565811157, "kl": 0.2439732700586319, "learning_rate": 3.147047612756302e-06, "loss": 0.0098, "reward": 1.7873218059539795, "reward_std": 1.3397150039672852, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.6123218536376953, "rewards/reward_search_strategy": 0.30000001192092896, "step": 475 }, { "completion_length": 371.875, "epoch": 1.6643356643356644, "grad_norm": 1.1894851922988892, "kl": 0.283910870552063, "learning_rate": 3.1386143948394764e-06, "loss": 0.0114, "reward": 3.811448812484741, "reward_std": 3.3633668422698975, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3864488899707794, "rewards/reward_search_strategy": 0.550000011920929, "step": 476 }, { "completion_length": 426.375, "epoch": 1.667832167832168, "grad_norm": 0.6441279649734497, "kl": 0.13555216789245605, "learning_rate": 3.130173395607785e-06, "loss": 0.0054, "reward": 4.598128318786621, "reward_std": 1.5403695106506348, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5481281280517578, "rewards/reward_search_strategy": 0.550000011920929, "step": 477 }, { "completion_length": 324.5, "epoch": 1.6713286713286712, "grad_norm": 2.565237283706665, "kl": 0.3227596580982208, "learning_rate": 3.121724717912138e-06, "loss": 0.0129, "reward": 4.640970230102539, "reward_std": 2.257411241531372, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.34097036719322205, "rewards/reward_search_strategy": 0.675000011920929, "step": 478 }, { "completion_length": 353.5, "epoch": 1.6748251748251748, "grad_norm": 0.9772915244102478, "kl": 0.19766215980052948, "learning_rate": 3.1132684646970068e-06, "loss": 0.0079, "reward": 6.462177276611328, "reward_std": 2.8117926120758057, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5121774077415466, "rewards/reward_search_strategy": 0.7000000476837158, "step": 479 }, { "completion_length": 353.25, "epoch": 1.6783216783216783, "grad_norm": 0.8048790097236633, "kl": 0.1595701426267624, "learning_rate": 3.1048047389991693e-06, "loss": 0.0064, "reward": 5.323690414428711, "reward_std": 2.957422971725464, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6236906051635742, "rewards/reward_search_strategy": 0.574999988079071, "step": 480 }, { "completion_length": 607.875, "epoch": 1.6818181818181817, "grad_norm": 0.6971364617347717, "kl": 0.15470711886882782, "learning_rate": 3.0963336439464527e-06, "loss": 0.0062, "reward": 4.107250690460205, "reward_std": 2.5991530418395996, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.48225075006484985, "rewards/reward_search_strategy": 0.625, "step": 481 }, { "completion_length": 505.0, "epoch": 1.6853146853146854, "grad_norm": 1.7211177349090576, "kl": 0.15958930552005768, "learning_rate": 3.087855282756475e-06, "loss": 0.0064, "reward": 3.6237030029296875, "reward_std": 1.4970024824142456, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5487030148506165, "rewards/reward_search_strategy": 0.574999988079071, "step": 482 }, { "completion_length": 410.5, "epoch": 1.6888111888111887, "grad_norm": 1.117256999015808, "kl": 0.22125303745269775, "learning_rate": 3.079369758735393e-06, "loss": 0.0089, "reward": 3.5827155113220215, "reward_std": 2.4601330757141113, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5577152967453003, "rewards/reward_search_strategy": 0.4000000059604645, "step": 483 }, { "completion_length": 405.625, "epoch": 1.6923076923076923, "grad_norm": 0.6825506687164307, "kl": 0.14363308250904083, "learning_rate": 3.0708771752766397e-06, "loss": 0.0057, "reward": 3.798617124557495, "reward_std": 0.7645675539970398, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.4986169636249542, "rewards/reward_search_strategy": 0.550000011920929, "step": 484 }, { "completion_length": 474.625, "epoch": 1.6958041958041958, "grad_norm": 1.18376886844635, "kl": 0.17103073000907898, "learning_rate": 3.062377635859663e-06, "loss": 0.0068, "reward": 4.56157112121582, "reward_std": 3.211364984512329, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3865714967250824, "rewards/reward_search_strategy": 0.675000011920929, "step": 485 }, { "completion_length": 290.625, "epoch": 1.6993006993006992, "grad_norm": 1.1491068601608276, "kl": 0.2933157980442047, "learning_rate": 3.053871244048669e-06, "loss": 0.0117, "reward": 3.049312114715576, "reward_std": 2.6080663204193115, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.29931211471557617, "rewards/reward_search_strategy": 0.5, "step": 486 }, { "completion_length": 419.75, "epoch": 1.702797202797203, "grad_norm": 1.6442235708236694, "kl": 0.22408655285835266, "learning_rate": 3.045358103491357e-06, "loss": 0.009, "reward": 5.084389686584473, "reward_std": 2.9285178184509277, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.534389853477478, "rewards/reward_search_strategy": 0.550000011920929, "step": 487 }, { "completion_length": 405.25, "epoch": 1.7062937062937062, "grad_norm": 7.855199337005615, "kl": 2.285917043685913, "learning_rate": 3.0368383179176584e-06, "loss": 0.0914, "reward": 3.893965721130371, "reward_std": 3.1409003734588623, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.1689659059047699, "rewards/reward_search_strategy": 0.4750000238418579, "step": 488 }, { "completion_length": 320.25, "epoch": 1.7097902097902098, "grad_norm": 0.8982135057449341, "kl": 0.25896814465522766, "learning_rate": 3.0283119911384724e-06, "loss": 0.0104, "reward": 4.6774492263793945, "reward_std": 2.9654905796051025, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4774494767189026, "rewards/reward_search_strategy": 0.574999988079071, "step": 489 }, { "completion_length": 326.75, "epoch": 1.7132867132867133, "grad_norm": 0.9028376340866089, "kl": 0.2348860800266266, "learning_rate": 3.019779227044398e-06, "loss": 0.0094, "reward": 4.503946304321289, "reward_std": 3.175311803817749, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.37894657254219055, "rewards/reward_search_strategy": 0.625, "step": 490 }, { "completion_length": 390.75, "epoch": 1.7167832167832167, "grad_norm": 0.7262877225875854, "kl": 0.1882961541414261, "learning_rate": 3.0112401296044756e-06, "loss": 0.0075, "reward": 2.486973762512207, "reward_std": 1.229779839515686, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4119737446308136, "rewards/reward_search_strategy": 0.44999998807907104, "step": 491 }, { "completion_length": 719.375, "epoch": 1.7202797202797204, "grad_norm": 0.6394368410110474, "kl": 0.13550426065921783, "learning_rate": 3.002694802864912e-06, "loss": 0.0054, "reward": 4.497361183166504, "reward_std": 2.7111971378326416, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.647361159324646, "rewards/reward_search_strategy": 0.6000000238418579, "step": 492 }, { "completion_length": 1683.875, "epoch": 1.7237762237762237, "grad_norm": 1.2878316640853882, "kl": 0.24534538388252258, "learning_rate": 2.9941433509478157e-06, "loss": 0.0098, "reward": 1.7936222553253174, "reward_std": 1.2284867763519287, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.4436222314834595, "rewards/reward_search_strategy": 0.4750000238418579, "step": 493 }, { "completion_length": 369.5, "epoch": 1.7272727272727273, "grad_norm": 0.6680236458778381, "kl": 0.18054969608783722, "learning_rate": 2.98558587804993e-06, "loss": 0.0072, "reward": 3.908217430114746, "reward_std": 0.9750527143478394, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5332175493240356, "rewards/reward_search_strategy": 0.625, "step": 494 }, { "completion_length": 839.875, "epoch": 1.7307692307692308, "grad_norm": 0.6535410284996033, "kl": 0.13271893560886383, "learning_rate": 2.9770224884413625e-06, "loss": 0.0053, "reward": 2.350466728210449, "reward_std": 2.074483871459961, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.5254667401313782, "rewards/reward_search_strategy": 0.32500001788139343, "step": 495 }, { "completion_length": 593.0, "epoch": 1.7342657342657342, "grad_norm": 0.8978703618049622, "kl": 0.22637799382209778, "learning_rate": 2.9684532864643123e-06, "loss": 0.0091, "reward": 4.150125503540039, "reward_std": 1.999029278755188, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5251256227493286, "rewards/reward_search_strategy": 0.625, "step": 496 }, { "completion_length": 728.625, "epoch": 1.737762237762238, "grad_norm": 0.7385975122451782, "kl": 0.17462122440338135, "learning_rate": 2.9598783765318005e-06, "loss": 0.007, "reward": 2.4210894107818604, "reward_std": 1.5048274993896484, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.49608945846557617, "rewards/reward_search_strategy": 0.550000011920929, "step": 497 }, { "completion_length": 657.875, "epoch": 1.7412587412587412, "grad_norm": 0.8704059720039368, "kl": 0.17740987241268158, "learning_rate": 2.9512978631264006e-06, "loss": 0.0071, "reward": 2.909700870513916, "reward_std": 1.5739132165908813, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5597009062767029, "rewards/reward_search_strategy": 0.4750000238418579, "step": 498 }, { "completion_length": 673.875, "epoch": 1.7447552447552448, "grad_norm": 0.7553533315658569, "kl": 0.17763131856918335, "learning_rate": 2.942711850798959e-06, "loss": 0.0071, "reward": 3.8633580207824707, "reward_std": 3.2166748046875, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.6133580207824707, "rewards/reward_search_strategy": 0.75, "step": 499 }, { "completion_length": 601.0, "epoch": 1.7482517482517483, "grad_norm": 1.2557988166809082, "kl": 0.3829733729362488, "learning_rate": 2.9341204441673267e-06, "loss": 0.0153, "reward": 2.2739150524139404, "reward_std": 1.694032073020935, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.3989151120185852, "rewards/reward_search_strategy": 0.375, "step": 500 }, { "completion_length": 699.5, "epoch": 1.7517482517482517, "grad_norm": 0.7813470959663391, "kl": 0.18124333024024963, "learning_rate": 2.9255237479150815e-06, "loss": 0.0072, "reward": 2.141329526901245, "reward_std": 1.5843234062194824, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.5913295745849609, "rewards/reward_search_strategy": 0.30000001192092896, "step": 501 }, { "completion_length": 465.375, "epoch": 1.7552447552447552, "grad_norm": 1.2914066314697266, "kl": 0.19192466139793396, "learning_rate": 2.9169218667902562e-06, "loss": 0.0077, "reward": 2.826913356781006, "reward_std": 1.4041271209716797, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4269134998321533, "rewards/reward_search_strategy": 0.5249999761581421, "step": 502 }, { "completion_length": 369.0, "epoch": 1.7587412587412588, "grad_norm": 0.9004430770874023, "kl": 0.19282333552837372, "learning_rate": 2.908314905604056e-06, "loss": 0.0077, "reward": 4.80116081237793, "reward_std": 3.8360373973846436, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.42616114020347595, "rewards/reward_search_strategy": 0.625, "step": 503 }, { "completion_length": 729.75, "epoch": 1.762237762237762, "grad_norm": 0.9142323732376099, "kl": 0.1846940666437149, "learning_rate": 2.8997029692295875e-06, "loss": 0.0074, "reward": 3.3747310638427734, "reward_std": 2.6164660453796387, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3497309386730194, "rewards/reward_search_strategy": 0.5249999761581421, "step": 504 }, { "completion_length": 439.875, "epoch": 1.7657342657342658, "grad_norm": 0.5326440334320068, "kl": 0.21149645745754242, "learning_rate": 2.8910861626005774e-06, "loss": 0.0085, "reward": 7.152305603027344, "reward_std": 2.5300772190093994, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5273054838180542, "rewards/reward_search_strategy": 0.875, "step": 505 }, { "completion_length": 377.75, "epoch": 1.7692307692307692, "grad_norm": 1.7362279891967773, "kl": 0.20955699682235718, "learning_rate": 2.8824645907100957e-06, "loss": 0.0084, "reward": 4.904088020324707, "reward_std": 2.917093515396118, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.17908819019794464, "rewards/reward_search_strategy": 0.6000000238418579, "step": 506 }, { "completion_length": 390.625, "epoch": 1.7727272727272727, "grad_norm": 1.1178964376449585, "kl": 0.29615768790245056, "learning_rate": 2.8738383586092745e-06, "loss": 0.0118, "reward": 5.1932878494262695, "reward_std": 3.258514165878296, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4932880401611328, "rewards/reward_search_strategy": 0.45000001788139343, "step": 507 }, { "completion_length": 257.875, "epoch": 1.7762237762237763, "grad_norm": 1.231881022453308, "kl": 0.3791511356830597, "learning_rate": 2.8652075714060296e-06, "loss": 0.0152, "reward": 5.252076625823975, "reward_std": 3.364706039428711, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3520766496658325, "rewards/reward_search_strategy": 0.5249999761581421, "step": 508 }, { "completion_length": 620.125, "epoch": 1.7797202797202796, "grad_norm": 1.4653148651123047, "kl": 0.2104392945766449, "learning_rate": 2.8565723342637797e-06, "loss": 0.0084, "reward": 4.483345031738281, "reward_std": 3.653613805770874, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.433345228433609, "rewards/reward_search_strategy": 0.42500001192092896, "step": 509 }, { "completion_length": 392.0, "epoch": 1.7832167832167833, "grad_norm": 0.5499141216278076, "kl": 0.21998637914657593, "learning_rate": 2.847932752400164e-06, "loss": 0.0088, "reward": 3.7722690105438232, "reward_std": 0.9182682037353516, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.522269070148468, "rewards/reward_search_strategy": 0.5, "step": 510 }, { "completion_length": 223.5, "epoch": 1.7867132867132867, "grad_norm": 2.654902219772339, "kl": 2.7393767833709717, "learning_rate": 2.8392889310857615e-06, "loss": 0.1096, "reward": 3.992436408996582, "reward_std": 3.132476806640625, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3424362540245056, "rewards/reward_search_strategy": 0.4000000059604645, "step": 511 }, { "completion_length": 367.125, "epoch": 1.7902097902097902, "grad_norm": 1.111373782157898, "kl": 0.1881609857082367, "learning_rate": 2.8306409756428067e-06, "loss": 0.0075, "reward": 3.7057249546051025, "reward_std": 2.5695714950561523, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.33072489500045776, "rewards/reward_search_strategy": 0.5, "step": 512 }, { "completion_length": 479.5, "epoch": 1.7937062937062938, "grad_norm": 0.621070921421051, "kl": 0.17087478935718536, "learning_rate": 2.8219889914439073e-06, "loss": 0.0068, "reward": 4.0383405685424805, "reward_std": 2.0420751571655273, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.43834051489830017, "rewards/reward_search_strategy": 0.7249999642372131, "step": 513 }, { "completion_length": 456.125, "epoch": 1.797202797202797, "grad_norm": 0.5883731245994568, "kl": 0.13777440786361694, "learning_rate": 2.813333083910761e-06, "loss": 0.0055, "reward": 3.0201869010925293, "reward_std": 2.117656946182251, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5451867580413818, "rewards/reward_search_strategy": 0.4750000238418579, "step": 514 }, { "completion_length": 727.5, "epoch": 1.8006993006993008, "grad_norm": 0.5874714255332947, "kl": 0.13230465352535248, "learning_rate": 2.804673358512869e-06, "loss": 0.0053, "reward": 4.169018745422363, "reward_std": 2.8212528228759766, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.26901865005493164, "rewards/reward_search_strategy": 0.6499999761581421, "step": 515 }, { "completion_length": 535.0, "epoch": 1.8041958041958042, "grad_norm": 0.6631115674972534, "kl": 0.1421699821949005, "learning_rate": 2.7960099207662535e-06, "loss": 0.0057, "reward": 3.8943893909454346, "reward_std": 2.7256648540496826, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.444389283657074, "rewards/reward_search_strategy": 0.44999998807907104, "step": 516 }, { "completion_length": 424.125, "epoch": 1.8076923076923077, "grad_norm": 0.8100361227989197, "kl": 0.19577130675315857, "learning_rate": 2.7873428762321667e-06, "loss": 0.0078, "reward": 6.033158302307129, "reward_std": 2.6231906414031982, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.4831581711769104, "rewards/reward_search_strategy": 0.6749999523162842, "step": 517 }, { "completion_length": 528.375, "epoch": 1.8111888111888113, "grad_norm": 1.45951247215271, "kl": 0.30490973591804504, "learning_rate": 2.778672330515814e-06, "loss": 0.0122, "reward": 4.320174217224121, "reward_std": 2.345180034637451, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.32017412781715393, "rewards/reward_search_strategy": 0.625, "step": 518 }, { "completion_length": 375.5, "epoch": 1.8146853146853146, "grad_norm": 1.146427869796753, "kl": 0.1250707358121872, "learning_rate": 2.769998389265057e-06, "loss": 0.005, "reward": 3.578047513961792, "reward_std": 1.96234929561615, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.45304757356643677, "rewards/reward_search_strategy": 0.75, "step": 519 }, { "completion_length": 535.25, "epoch": 1.8181818181818183, "grad_norm": 0.9977327585220337, "kl": 0.27235743403434753, "learning_rate": 2.761321158169134e-06, "loss": 0.0109, "reward": 4.564298629760742, "reward_std": 3.3091213703155518, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.43929851055145264, "rewards/reward_search_strategy": 0.625, "step": 520 }, { "completion_length": 231.125, "epoch": 1.8216783216783217, "grad_norm": 1.618683934211731, "kl": 0.2755619287490845, "learning_rate": 2.752640742957366e-06, "loss": 0.011, "reward": 4.865569114685059, "reward_std": 3.661264419555664, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.39056915044784546, "rewards/reward_search_strategy": 0.4750000238418579, "step": 521 }, { "completion_length": 636.625, "epoch": 1.8251748251748252, "grad_norm": 0.583243191242218, "kl": 0.16021624207496643, "learning_rate": 2.743957249397874e-06, "loss": 0.0064, "reward": 4.629208564758301, "reward_std": 2.8494086265563965, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4792088270187378, "rewards/reward_search_strategy": 0.7749999761581421, "step": 522 }, { "completion_length": 489.75, "epoch": 1.8286713286713288, "grad_norm": 0.998420000076294, "kl": 0.26722487807273865, "learning_rate": 2.7352707832962865e-06, "loss": 0.0107, "reward": 4.075039386749268, "reward_std": 1.9532568454742432, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3500395119190216, "rewards/reward_search_strategy": 0.4749999940395355, "step": 523 }, { "completion_length": 437.125, "epoch": 1.832167832167832, "grad_norm": 3.2135281562805176, "kl": 0.21277548372745514, "learning_rate": 2.726581450494451e-06, "loss": 0.0085, "reward": 2.2732386589050293, "reward_std": 1.2628512382507324, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4982386827468872, "rewards/reward_search_strategy": 0.5249999761581421, "step": 524 }, { "completion_length": 366.375, "epoch": 1.8356643356643356, "grad_norm": 1.3392611742019653, "kl": 0.24789753556251526, "learning_rate": 2.717889356869146e-06, "loss": 0.0099, "reward": 5.1830363273620605, "reward_std": 2.9729130268096924, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.25803637504577637, "rewards/reward_search_strategy": 0.550000011920929, "step": 525 }, { "completion_length": 509.875, "epoch": 1.8391608391608392, "grad_norm": 0.5417003035545349, "kl": 0.1483701765537262, "learning_rate": 2.70919460833079e-06, "loss": 0.0059, "reward": 7.1703081130981445, "reward_std": 2.587738037109375, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.7453081607818604, "rewards/reward_search_strategy": 0.800000011920929, "step": 526 }, { "completion_length": 367.75, "epoch": 1.8426573426573427, "grad_norm": 0.8764938116073608, "kl": 0.2148994356393814, "learning_rate": 2.700497310822147e-06, "loss": 0.0086, "reward": 5.292134761810303, "reward_std": 3.6617748737335205, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5421345829963684, "rewards/reward_search_strategy": 0.625, "step": 527 }, { "completion_length": 278.875, "epoch": 1.8461538461538463, "grad_norm": 0.9607499837875366, "kl": 0.23589320480823517, "learning_rate": 2.6917975703170466e-06, "loss": 0.0094, "reward": 4.2339372634887695, "reward_std": 2.0283195972442627, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.4589373767375946, "rewards/reward_search_strategy": 0.7749999761581421, "step": 528 }, { "completion_length": 363.75, "epoch": 1.8496503496503496, "grad_norm": 0.7568937540054321, "kl": 0.1478801816701889, "learning_rate": 2.6830954928190795e-06, "loss": 0.0059, "reward": 4.656356334686279, "reward_std": 2.3339731693267822, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.4563564360141754, "rewards/reward_search_strategy": 0.699999988079071, "step": 529 }, { "completion_length": 418.125, "epoch": 1.8531468531468531, "grad_norm": 0.73731529712677, "kl": 0.1510273814201355, "learning_rate": 2.6743911843603134e-06, "loss": 0.006, "reward": 4.6169233322143555, "reward_std": 2.509160041809082, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5919233560562134, "rewards/reward_search_strategy": 0.6499999761581421, "step": 530 }, { "completion_length": 393.625, "epoch": 1.8566433566433567, "grad_norm": 0.985019326210022, "kl": 0.25836557149887085, "learning_rate": 2.6656847510000013e-06, "loss": 0.0103, "reward": 3.9243826866149902, "reward_std": 2.617119073867798, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3493826687335968, "rewards/reward_search_strategy": 0.44999998807907104, "step": 531 }, { "completion_length": 274.625, "epoch": 1.86013986013986, "grad_norm": 1.0898783206939697, "kl": 0.2563922703266144, "learning_rate": 2.6569762988232838e-06, "loss": 0.0103, "reward": 3.603743076324463, "reward_std": 3.3509886264801025, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.42874306440353394, "rewards/reward_search_strategy": 0.42500001192092896, "step": 532 }, { "completion_length": 374.25, "epoch": 1.8636363636363638, "grad_norm": 0.7089191675186157, "kl": 0.15728232264518738, "learning_rate": 2.6482659339399047e-06, "loss": 0.0063, "reward": 4.477728366851807, "reward_std": 1.6571000814437866, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.35272836685180664, "rewards/reward_search_strategy": 0.75, "step": 533 }, { "completion_length": 473.0, "epoch": 1.867132867132867, "grad_norm": 0.6451985239982605, "kl": 0.1318826675415039, "learning_rate": 2.63955376248291e-06, "loss": 0.0053, "reward": 5.787966728210449, "reward_std": 2.74002742767334, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5629667639732361, "rewards/reward_search_strategy": 0.6000000238418579, "step": 534 }, { "completion_length": 436.375, "epoch": 1.8706293706293706, "grad_norm": 1.4988572597503662, "kl": 0.18932144343852997, "learning_rate": 2.6308398906073603e-06, "loss": 0.0076, "reward": 5.46368408203125, "reward_std": 2.972785711288452, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4886842966079712, "rewards/reward_search_strategy": 0.6000000238418579, "step": 535 }, { "completion_length": 355.375, "epoch": 1.8741258741258742, "grad_norm": 0.7932916879653931, "kl": 0.2548842132091522, "learning_rate": 2.6221244244890336e-06, "loss": 0.0102, "reward": 6.01389217376709, "reward_std": 2.8930044174194336, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4138926863670349, "rewards/reward_search_strategy": 0.6000000238418579, "step": 536 }, { "completion_length": 306.125, "epoch": 1.8776223776223775, "grad_norm": 1.055395483970642, "kl": 0.30341437458992004, "learning_rate": 2.613407470323134e-06, "loss": 0.0121, "reward": 5.066512107849121, "reward_std": 2.5190961360931396, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.21651223301887512, "rewards/reward_search_strategy": 0.6000000238418579, "step": 537 }, { "completion_length": 929.25, "epoch": 1.8811188811188813, "grad_norm": 0.603378176689148, "kl": 0.1319848597049713, "learning_rate": 2.604689134322999e-06, "loss": 0.0053, "reward": 4.212712287902832, "reward_std": 2.048562526702881, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.48771214485168457, "rewards/reward_search_strategy": 0.4750000238418579, "step": 538 }, { "completion_length": 777.625, "epoch": 1.8846153846153846, "grad_norm": 0.805946409702301, "kl": 0.1326817125082016, "learning_rate": 2.5959695227188e-06, "loss": 0.0053, "reward": 3.8090720176696777, "reward_std": 2.5989437103271484, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6590719223022461, "rewards/reward_search_strategy": 0.5249999761581421, "step": 539 }, { "completion_length": 604.0, "epoch": 1.8881118881118881, "grad_norm": 0.7141540050506592, "kl": 0.15499770641326904, "learning_rate": 2.587248741756253e-06, "loss": 0.0062, "reward": 5.391590118408203, "reward_std": 3.8308281898498535, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4915906488895416, "rewards/reward_search_strategy": 0.6499999761581421, "step": 540 }, { "completion_length": 426.875, "epoch": 1.8916083916083917, "grad_norm": 0.6566001772880554, "kl": 0.11781422793865204, "learning_rate": 2.578526897695321e-06, "loss": 0.0047, "reward": 6.018224716186523, "reward_std": 2.3312177658081055, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.41822493076324463, "rewards/reward_search_strategy": 0.6000000238418579, "step": 541 }, { "completion_length": 311.25, "epoch": 1.895104895104895, "grad_norm": 1.291000247001648, "kl": 0.2666287422180176, "learning_rate": 2.569804096808923e-06, "loss": 0.0107, "reward": 5.1927337646484375, "reward_std": 2.755157470703125, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3927338719367981, "rewards/reward_search_strategy": 0.550000011920929, "step": 542 }, { "completion_length": 305.25, "epoch": 1.8986013986013988, "grad_norm": 0.8758834600448608, "kl": 0.21868717670440674, "learning_rate": 2.5610804453816333e-06, "loss": 0.0087, "reward": 3.3073205947875977, "reward_std": 0.9015099406242371, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5323205590248108, "rewards/reward_search_strategy": 0.6499999761581421, "step": 543 }, { "completion_length": 392.5, "epoch": 1.902097902097902, "grad_norm": 1.5083352327346802, "kl": 0.21986426413059235, "learning_rate": 2.5523560497083927e-06, "loss": 0.0088, "reward": 7.508007049560547, "reward_std": 1.417626142501831, "rewards/reward_correctness": 1.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.008006956428289413, "rewards/reward_search_strategy": 0.875, "step": 544 }, { "completion_length": 394.0, "epoch": 1.9055944055944056, "grad_norm": 1.5424015522003174, "kl": 0.17503148317337036, "learning_rate": 2.543631016093209e-06, "loss": 0.007, "reward": 4.247199058532715, "reward_std": 1.7720770835876465, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.4471993148326874, "rewards/reward_search_strategy": 0.550000011920929, "step": 545 }, { "completion_length": 386.0, "epoch": 1.9090909090909092, "grad_norm": 0.7861531972885132, "kl": 0.20189355313777924, "learning_rate": 2.5349054508478636e-06, "loss": 0.0081, "reward": 5.381592750549316, "reward_std": 2.4070920944213867, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.48159298300743103, "rewards/reward_search_strategy": 0.6499999761581421, "step": 546 }, { "completion_length": 340.875, "epoch": 1.9125874125874125, "grad_norm": 0.8309083580970764, "kl": 0.2448170930147171, "learning_rate": 2.526179460290615e-06, "loss": 0.0098, "reward": 3.613577365875244, "reward_std": 2.6209349632263184, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5135773420333862, "rewards/reward_search_strategy": 0.6000000238418579, "step": 547 }, { "completion_length": 663.5, "epoch": 1.916083916083916, "grad_norm": 0.5724729895591736, "kl": 0.1197650209069252, "learning_rate": 2.517453150744904e-06, "loss": 0.0048, "reward": 3.980480909347534, "reward_std": 2.2730460166931152, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.55548095703125, "rewards/reward_search_strategy": 0.675000011920929, "step": 548 }, { "completion_length": 272.75, "epoch": 1.9195804195804196, "grad_norm": 1.6417477130889893, "kl": 0.28812453150749207, "learning_rate": 2.5087266285380597e-06, "loss": 0.0115, "reward": 2.912532329559326, "reward_std": 1.6371732950210571, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.23753249645233154, "rewards/reward_search_strategy": 0.5499999523162842, "step": 549 }, { "completion_length": 305.0, "epoch": 1.9230769230769231, "grad_norm": 6.134403228759766, "kl": 0.8665672540664673, "learning_rate": 2.5e-06, "loss": 0.0347, "reward": 4.181793212890625, "reward_std": 2.829068183898926, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4317927658557892, "rewards/reward_search_strategy": 0.625, "step": 550 }, { "completion_length": 298.0, "epoch": 1.9265734265734267, "grad_norm": 0.9454349279403687, "kl": 0.11578426510095596, "learning_rate": 2.4912733714619415e-06, "loss": 0.0046, "reward": 3.1753480434417725, "reward_std": 1.6148889064788818, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6503480672836304, "rewards/reward_search_strategy": 0.5250000357627869, "step": 551 }, { "completion_length": 278.375, "epoch": 1.93006993006993, "grad_norm": 1.9904608726501465, "kl": 0.2013421654701233, "learning_rate": 2.482546849255096e-06, "loss": 0.0081, "reward": 2.3399460315704346, "reward_std": 2.094362258911133, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.2149459570646286, "rewards/reward_search_strategy": 0.5, "step": 552 }, { "completion_length": 384.625, "epoch": 1.9335664335664335, "grad_norm": 0.7055719494819641, "kl": 0.16155293583869934, "learning_rate": 2.4738205397093863e-06, "loss": 0.0065, "reward": 5.295562744140625, "reward_std": 2.516092538833618, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.27056291699409485, "rewards/reward_search_strategy": 0.6499999761581421, "step": 553 }, { "completion_length": 403.375, "epoch": 1.937062937062937, "grad_norm": 0.6282983422279358, "kl": 0.10670147091150284, "learning_rate": 2.4650945491521372e-06, "loss": 0.0043, "reward": 3.8357903957366943, "reward_std": 1.950538992881775, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.11079052835702896, "rewards/reward_search_strategy": 0.7250000238418579, "step": 554 }, { "completion_length": 371.125, "epoch": 1.9405594405594404, "grad_norm": 0.9865713119506836, "kl": 0.13365398347377777, "learning_rate": 2.4563689839067913e-06, "loss": 0.0053, "reward": 4.407708168029785, "reward_std": 1.5128272771835327, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.4577085077762604, "rewards/reward_search_strategy": 0.44999998807907104, "step": 555 }, { "completion_length": 343.875, "epoch": 1.9440559440559442, "grad_norm": 2.774280548095703, "kl": 0.4545011520385742, "learning_rate": 2.447643950291608e-06, "loss": 0.0182, "reward": 5.597696781158447, "reward_std": 2.1334950923919678, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5226967334747314, "rewards/reward_search_strategy": 0.5750000476837158, "step": 556 }, { "completion_length": 439.25, "epoch": 1.9475524475524475, "grad_norm": 0.7445939779281616, "kl": 0.14428403973579407, "learning_rate": 2.4389195546183676e-06, "loss": 0.0058, "reward": 3.189345598220825, "reward_std": 1.6320502758026123, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3643457889556885, "rewards/reward_search_strategy": 0.5750000476837158, "step": 557 }, { "completion_length": 406.0, "epoch": 1.951048951048951, "grad_norm": 0.6262521147727966, "kl": 0.16001485288143158, "learning_rate": 2.4301959031910785e-06, "loss": 0.0064, "reward": 5.268270969390869, "reward_std": 2.528679609298706, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.39327099919319153, "rewards/reward_search_strategy": 0.75, "step": 558 }, { "completion_length": 571.125, "epoch": 1.9545454545454546, "grad_norm": 1.8445830345153809, "kl": 0.2330249845981598, "learning_rate": 2.4214731023046795e-06, "loss": 0.0093, "reward": 2.626037120819092, "reward_std": 2.412799596786499, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.3260372579097748, "rewards/reward_search_strategy": 0.42499998211860657, "step": 559 }, { "completion_length": 484.5, "epoch": 1.958041958041958, "grad_norm": 1.1257758140563965, "kl": 0.15154430270195007, "learning_rate": 2.4127512582437486e-06, "loss": 0.0061, "reward": 2.8468713760375977, "reward_std": 2.2975640296936035, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.42187127470970154, "rewards/reward_search_strategy": 0.42500001192092896, "step": 560 }, { "completion_length": 293.75, "epoch": 1.9615384615384617, "grad_norm": 3.8214211463928223, "kl": 0.45248979330062866, "learning_rate": 2.4040304772812002e-06, "loss": 0.0181, "reward": 3.833535671234131, "reward_std": 3.828199863433838, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.48353567719459534, "rewards/reward_search_strategy": 0.3500000238418579, "step": 561 }, { "completion_length": 335.5, "epoch": 1.965034965034965, "grad_norm": 1.3766762018203735, "kl": 0.2158210724592209, "learning_rate": 2.3953108656770018e-06, "loss": 0.0086, "reward": 4.993716239929199, "reward_std": 2.7581541538238525, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.018716657534241676, "rewards/reward_search_strategy": 0.7250000238418579, "step": 562 }, { "completion_length": 497.75, "epoch": 1.9685314685314685, "grad_norm": 1.5413566827774048, "kl": 0.2473594844341278, "learning_rate": 2.3865925296768658e-06, "loss": 0.0099, "reward": 2.4500536918640137, "reward_std": 1.235512137413025, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.40005365014076233, "rewards/reward_search_strategy": 0.550000011920929, "step": 563 }, { "completion_length": 401.625, "epoch": 1.972027972027972, "grad_norm": 0.9273636937141418, "kl": 0.21244309842586517, "learning_rate": 2.377875575510967e-06, "loss": 0.0085, "reward": 4.193811893463135, "reward_std": 0.9206790328025818, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6438118815422058, "rewards/reward_search_strategy": 0.550000011920929, "step": 564 }, { "completion_length": 536.0, "epoch": 1.9755244755244754, "grad_norm": 1.1288378238677979, "kl": 0.22064147889614105, "learning_rate": 2.3691601093926406e-06, "loss": 0.0088, "reward": 3.3308537006378174, "reward_std": 3.206730365753174, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.38085371255874634, "rewards/reward_search_strategy": 0.32499998807907104, "step": 565 }, { "completion_length": 478.125, "epoch": 1.9790209790209792, "grad_norm": 0.6461117267608643, "kl": 0.12631751596927643, "learning_rate": 2.3604462375170905e-06, "loss": 0.0051, "reward": 2.805847406387329, "reward_std": 2.364455461502075, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.6058473587036133, "rewards/reward_search_strategy": 0.44999998807907104, "step": 566 }, { "completion_length": 229.5, "epoch": 1.9825174825174825, "grad_norm": 0.9939329624176025, "kl": 0.25228554010391235, "learning_rate": 2.3517340660600965e-06, "loss": 0.0101, "reward": 5.576767921447754, "reward_std": 3.05527663230896, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.47676780819892883, "rewards/reward_search_strategy": 0.3499999940395355, "step": 567 }, { "completion_length": 445.125, "epoch": 1.986013986013986, "grad_norm": 1.2215253114700317, "kl": 0.19178111851215363, "learning_rate": 2.3430237011767166e-06, "loss": 0.0077, "reward": 2.854430913925171, "reward_std": 2.5454559326171875, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3544308841228485, "rewards/reward_search_strategy": 0.5, "step": 568 }, { "completion_length": 453.125, "epoch": 1.9895104895104896, "grad_norm": 1.3764644861221313, "kl": 0.23443952202796936, "learning_rate": 2.3343152490000004e-06, "loss": 0.0094, "reward": 3.74816632270813, "reward_std": 2.4638051986694336, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.2731662094593048, "rewards/reward_search_strategy": 0.7249999642372131, "step": 569 }, { "completion_length": 728.375, "epoch": 1.993006993006993, "grad_norm": 0.7834612727165222, "kl": 0.17851339280605316, "learning_rate": 2.325608815639687e-06, "loss": 0.0071, "reward": 3.360065221786499, "reward_std": 2.5360162258148193, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.48506492376327515, "rewards/reward_search_strategy": 0.5, "step": 570 }, { "completion_length": 317.125, "epoch": 1.9965034965034965, "grad_norm": 0.8286842107772827, "kl": 0.2042827308177948, "learning_rate": 2.3169045071809217e-06, "loss": 0.0082, "reward": 3.511277198791504, "reward_std": 3.0144357681274414, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.38627690076828003, "rewards/reward_search_strategy": 0.625, "step": 571 }, { "completion_length": 438.0, "epoch": 2.0, "grad_norm": 1.4197484254837036, "kl": 0.22094160318374634, "learning_rate": 2.3082024296829538e-06, "loss": 0.0088, "reward": 3.823275089263916, "reward_std": 2.203106164932251, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.22327494621276855, "rewards/reward_search_strategy": 0.6000000238418579, "step": 572 }, { "completion_length": 528.125, "epoch": 2.0034965034965033, "grad_norm": 1.1150870323181152, "kl": 0.16079114377498627, "learning_rate": 2.2995026891778533e-06, "loss": 0.0064, "reward": 3.081960916519165, "reward_std": 2.8381266593933105, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4069609045982361, "rewards/reward_search_strategy": 0.42500001192092896, "step": 573 }, { "completion_length": 447.375, "epoch": 2.006993006993007, "grad_norm": 2.044112205505371, "kl": 0.23929806053638458, "learning_rate": 2.290805391669212e-06, "loss": 0.0096, "reward": 3.1448910236358643, "reward_std": 1.6620686054229736, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4448910355567932, "rewards/reward_search_strategy": 0.574999988079071, "step": 574 }, { "completion_length": 344.125, "epoch": 2.0104895104895104, "grad_norm": 1.1256439685821533, "kl": 0.40603435039520264, "learning_rate": 2.2821106431308546e-06, "loss": 0.0162, "reward": 3.590481996536255, "reward_std": 3.215083360671997, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3404819071292877, "rewards/reward_search_strategy": 0.5, "step": 575 }, { "completion_length": 365.875, "epoch": 2.013986013986014, "grad_norm": 0.9045865535736084, "kl": 0.21638526022434235, "learning_rate": 2.2734185495055503e-06, "loss": 0.0087, "reward": 5.558139324188232, "reward_std": 2.632394552230835, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.6581395864486694, "rewards/reward_search_strategy": 0.6499999761581421, "step": 576 }, { "completion_length": 362.0, "epoch": 2.0174825174825175, "grad_norm": 1.3176411390304565, "kl": 0.29657936096191406, "learning_rate": 2.2647292167037143e-06, "loss": 0.0119, "reward": 3.193131685256958, "reward_std": 3.028653621673584, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.39313167333602905, "rewards/reward_search_strategy": 0.42500001192092896, "step": 577 }, { "completion_length": 613.375, "epoch": 2.020979020979021, "grad_norm": 0.735613226890564, "kl": 0.17025895416736603, "learning_rate": 2.256042750602127e-06, "loss": 0.0068, "reward": 4.768100261688232, "reward_std": 1.8358144760131836, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.643100380897522, "rewards/reward_search_strategy": 0.625, "step": 578 }, { "completion_length": 657.125, "epoch": 2.0244755244755246, "grad_norm": 0.9518574476242065, "kl": 0.14201372861862183, "learning_rate": 2.2473592570426343e-06, "loss": 0.0057, "reward": 4.414166450500488, "reward_std": 2.766263484954834, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.1641668975353241, "rewards/reward_search_strategy": 0.5, "step": 579 }, { "completion_length": 519.875, "epoch": 2.027972027972028, "grad_norm": 1.9180530309677124, "kl": 0.39803510904312134, "learning_rate": 2.238678841830867e-06, "loss": 0.0159, "reward": 2.850477695465088, "reward_std": 2.8477377891540527, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4254775941371918, "rewards/reward_search_strategy": 0.30000001192092896, "step": 580 }, { "completion_length": 263.125, "epoch": 2.0314685314685317, "grad_norm": 1.2091302871704102, "kl": 0.26268723607063293, "learning_rate": 2.230001610734943e-06, "loss": 0.0105, "reward": 3.0431175231933594, "reward_std": 0.7641079425811768, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.568117618560791, "rewards/reward_search_strategy": 0.7249999642372131, "step": 581 }, { "completion_length": 236.625, "epoch": 2.034965034965035, "grad_norm": 1.5325607061386108, "kl": 0.44615864753723145, "learning_rate": 2.2213276694841866e-06, "loss": 0.0178, "reward": 3.520362377166748, "reward_std": 3.250920295715332, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.22036239504814148, "rewards/reward_search_strategy": 0.42500001192092896, "step": 582 }, { "completion_length": 511.875, "epoch": 2.0384615384615383, "grad_norm": 0.9509716629981995, "kl": 0.23918908834457397, "learning_rate": 2.212657123767834e-06, "loss": 0.0096, "reward": 5.3234076499938965, "reward_std": 3.40258526802063, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.29840800166130066, "rewards/reward_search_strategy": 0.6499999761581421, "step": 583 }, { "completion_length": 608.5, "epoch": 2.041958041958042, "grad_norm": 0.7608333230018616, "kl": 0.17903365194797516, "learning_rate": 2.2039900792337477e-06, "loss": 0.0072, "reward": 3.5384345054626465, "reward_std": 0.9120908379554749, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6134345531463623, "rewards/reward_search_strategy": 0.675000011920929, "step": 584 }, { "completion_length": 319.125, "epoch": 2.0454545454545454, "grad_norm": 1.3616375923156738, "kl": 0.3396897614002228, "learning_rate": 2.195326641487132e-06, "loss": 0.0136, "reward": 3.8816871643066406, "reward_std": 3.524160623550415, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4816873073577881, "rewards/reward_search_strategy": 0.5249999761581421, "step": 585 }, { "completion_length": 298.625, "epoch": 2.0489510489510487, "grad_norm": 1.714475154876709, "kl": 0.467647522687912, "learning_rate": 2.186666916089239e-06, "loss": 0.0187, "reward": 3.5826728343963623, "reward_std": 3.505326747894287, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3826729357242584, "rewards/reward_search_strategy": 0.44999998807907104, "step": 586 }, { "completion_length": 329.0, "epoch": 2.0524475524475525, "grad_norm": 2.293823480606079, "kl": 0.3535889685153961, "learning_rate": 2.1780110085560935e-06, "loss": 0.0141, "reward": 3.534921169281006, "reward_std": 2.731309652328491, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.40992122888565063, "rewards/reward_search_strategy": 0.5, "step": 587 }, { "completion_length": 305.0, "epoch": 2.055944055944056, "grad_norm": 1.2662256956100464, "kl": 0.56394362449646, "learning_rate": 2.1693590243571937e-06, "loss": 0.0226, "reward": 3.4940059185028076, "reward_std": 2.9306962490081787, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.19400593638420105, "rewards/reward_search_strategy": 0.42500001192092896, "step": 588 }, { "completion_length": 755.75, "epoch": 2.0594405594405596, "grad_norm": 0.6191352605819702, "kl": 0.21323426067829132, "learning_rate": 2.1607110689142393e-06, "loss": 0.0085, "reward": 3.349336624145508, "reward_std": 2.5924930572509766, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3493368625640869, "rewards/reward_search_strategy": 0.375, "step": 589 }, { "completion_length": 618.375, "epoch": 2.062937062937063, "grad_norm": 1.132805585861206, "kl": 0.39246758818626404, "learning_rate": 2.1520672475998374e-06, "loss": 0.0157, "reward": 2.791980743408203, "reward_std": 1.946831226348877, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.5919808149337769, "rewards/reward_search_strategy": 0.45000001788139343, "step": 590 }, { "completion_length": 1027.625, "epoch": 2.0664335664335662, "grad_norm": 0.8593956828117371, "kl": 0.24245135486125946, "learning_rate": 2.143427665736221e-06, "loss": 0.0097, "reward": 2.7288267612457275, "reward_std": 2.470308303833008, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.5038267970085144, "rewards/reward_search_strategy": 0.3500000238418579, "step": 591 }, { "completion_length": 446.75, "epoch": 2.06993006993007, "grad_norm": 2.004469633102417, "kl": 0.5076056122779846, "learning_rate": 2.134792428593971e-06, "loss": 0.0203, "reward": 4.7919745445251465, "reward_std": 2.649038791656494, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4669745862483978, "rewards/reward_search_strategy": 0.45000001788139343, "step": 592 }, { "completion_length": 299.625, "epoch": 2.0734265734265733, "grad_norm": 1.6865434646606445, "kl": 0.4611359238624573, "learning_rate": 2.1261616413907267e-06, "loss": 0.0184, "reward": 2.921395778656006, "reward_std": 2.20475697517395, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4463959336280823, "rewards/reward_search_strategy": 0.4749999940395355, "step": 593 }, { "completion_length": 246.75, "epoch": 2.076923076923077, "grad_norm": 1.3372602462768555, "kl": 0.3958607316017151, "learning_rate": 2.117535409289905e-06, "loss": 0.0158, "reward": 3.5213401317596436, "reward_std": 3.8686110973358154, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.37134024500846863, "rewards/reward_search_strategy": 0.2750000059604645, "step": 594 }, { "completion_length": 248.375, "epoch": 2.0804195804195804, "grad_norm": 1.687395453453064, "kl": 0.48411720991134644, "learning_rate": 2.1089138373994226e-06, "loss": 0.0194, "reward": 2.76412034034729, "reward_std": 2.2025575637817383, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3891204297542572, "rewards/reward_search_strategy": 0.375, "step": 595 }, { "completion_length": 440.375, "epoch": 2.0839160839160837, "grad_norm": 5.519715309143066, "kl": 2.0341320037841797, "learning_rate": 2.1002970307704134e-06, "loss": 0.0814, "reward": 2.955754041671753, "reward_std": 2.849426031112671, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.18075412511825562, "rewards/reward_search_strategy": 0.4000000059604645, "step": 596 }, { "completion_length": 457.25, "epoch": 2.0874125874125875, "grad_norm": 0.6264410018920898, "kl": 0.43496525287628174, "learning_rate": 2.0916850943959453e-06, "loss": 0.0174, "reward": 3.4208319187164307, "reward_std": 1.2662279605865479, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.4708319306373596, "rewards/reward_search_strategy": 0.44999998807907104, "step": 597 }, { "completion_length": 537.75, "epoch": 2.090909090909091, "grad_norm": 0.9656305909156799, "kl": 0.3533865511417389, "learning_rate": 2.0830781332097446e-06, "loss": 0.0141, "reward": 2.179312229156494, "reward_std": 1.0181849002838135, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.32931211590766907, "rewards/reward_search_strategy": 0.4749999940395355, "step": 598 }, { "completion_length": 683.375, "epoch": 2.0944055944055946, "grad_norm": 1.1000615358352661, "kl": 0.28707969188690186, "learning_rate": 2.0744762520849193e-06, "loss": 0.0115, "reward": 2.6007080078125, "reward_std": 2.0923187732696533, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.6257079243659973, "rewards/reward_search_strategy": 0.3500000238418579, "step": 599 }, { "completion_length": 278.25, "epoch": 2.097902097902098, "grad_norm": 0.84709233045578, "kl": 0.35083308815956116, "learning_rate": 2.0658795558326745e-06, "loss": 0.014, "reward": 4.950100898742676, "reward_std": 2.8403358459472656, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.45010069012641907, "rewards/reward_search_strategy": 0.3750000298023224, "step": 600 }, { "completion_length": 620.375, "epoch": 2.1013986013986012, "grad_norm": 2.721386671066284, "kl": 0.4919606149196625, "learning_rate": 2.0572881492010423e-06, "loss": 0.0197, "reward": 1.8211491107940674, "reward_std": 1.338912010192871, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.396149218082428, "rewards/reward_search_strategy": 0.42500001192092896, "step": 601 }, { "completion_length": 429.875, "epoch": 2.104895104895105, "grad_norm": 2.013206720352173, "kl": 0.815765380859375, "learning_rate": 2.0487021368736002e-06, "loss": 0.0326, "reward": 1.7140884399414062, "reward_std": 1.3734639883041382, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3640882968902588, "rewards/reward_search_strategy": 0.3500000238418579, "step": 602 }, { "completion_length": 244.25, "epoch": 2.1083916083916083, "grad_norm": 1.3675538301467896, "kl": 0.43126821517944336, "learning_rate": 2.0401216234682e-06, "loss": 0.0173, "reward": 3.1364965438842773, "reward_std": 1.2145814895629883, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.43649643659591675, "rewards/reward_search_strategy": 0.45000001788139343, "step": 603 }, { "completion_length": 665.625, "epoch": 2.111888111888112, "grad_norm": 0.9919797778129578, "kl": 0.24450455605983734, "learning_rate": 2.031546713535688e-06, "loss": 0.0098, "reward": 4.791861534118652, "reward_std": 3.1218600273132324, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.7168612480163574, "rewards/reward_search_strategy": 0.32500001788139343, "step": 604 }, { "completion_length": 439.125, "epoch": 2.1153846153846154, "grad_norm": 2.6146435737609863, "kl": 1.0374516248703003, "learning_rate": 2.022977511558638e-06, "loss": 0.0415, "reward": 2.1190848350524902, "reward_std": 1.5598971843719482, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.19408495724201202, "rewards/reward_search_strategy": 0.17500001192092896, "step": 605 }, { "completion_length": 211.75, "epoch": 2.1188811188811187, "grad_norm": 1.2216012477874756, "kl": 0.6190358996391296, "learning_rate": 2.0144141219500707e-06, "loss": 0.0248, "reward": 1.935401439666748, "reward_std": 1.185782551765442, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.28540149331092834, "rewards/reward_search_strategy": 0.40000003576278687, "step": 606 }, { "completion_length": 123.25, "epoch": 2.1223776223776225, "grad_norm": 2.1749942302703857, "kl": 0.8067781925201416, "learning_rate": 2.0058566490521848e-06, "loss": 0.0323, "reward": 1.869102954864502, "reward_std": 2.623877763748169, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.2941029667854309, "rewards/reward_search_strategy": 0.20000000298023224, "step": 607 }, { "completion_length": 301.5, "epoch": 2.125874125874126, "grad_norm": 3.136425733566284, "kl": 0.8475527763366699, "learning_rate": 1.997305197135089e-06, "loss": 0.0339, "reward": 1.8826358318328857, "reward_std": 2.2515580654144287, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.15763583779335022, "rewards/reward_search_strategy": 0.22500000894069672, "step": 608 }, { "completion_length": 524.125, "epoch": 2.129370629370629, "grad_norm": 2.439591407775879, "kl": 0.6721910834312439, "learning_rate": 1.9887598703955244e-06, "loss": 0.0269, "reward": 0.996848464012146, "reward_std": 1.0712072849273682, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.3468484580516815, "rewards/reward_search_strategy": 0.15000000596046448, "step": 609 }, { "completion_length": 588.125, "epoch": 2.132867132867133, "grad_norm": 1.9819761514663696, "kl": 0.3836514353752136, "learning_rate": 1.9802207729556023e-06, "loss": 0.0153, "reward": 2.9225618839263916, "reward_std": 1.442522644996643, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5475618243217468, "rewards/reward_search_strategy": 0.25, "step": 610 }, { "completion_length": 324.125, "epoch": 2.1363636363636362, "grad_norm": 1.9776884317398071, "kl": 1.3968545198440552, "learning_rate": 1.971688008861529e-06, "loss": 0.0559, "reward": 1.0356595516204834, "reward_std": 1.2683775424957275, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.3356595039367676, "rewards/reward_search_strategy": 0.20000000298023224, "step": 611 }, { "completion_length": 682.875, "epoch": 2.13986013986014, "grad_norm": 0.6471878290176392, "kl": 0.2545889914035797, "learning_rate": 1.963161682082342e-06, "loss": 0.0102, "reward": 2.734546661376953, "reward_std": 1.6279962062835693, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3595465421676636, "rewards/reward_search_strategy": 0.375, "step": 612 }, { "completion_length": 239.25, "epoch": 2.1433566433566433, "grad_norm": 1.6059681177139282, "kl": 0.6571323275566101, "learning_rate": 1.9546418965086444e-06, "loss": 0.0263, "reward": 2.048656463623047, "reward_std": 1.680714726448059, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3236566185951233, "rewards/reward_search_strategy": 0.3500000238418579, "step": 613 }, { "completion_length": 247.5, "epoch": 2.1468531468531467, "grad_norm": 2.003173351287842, "kl": 0.8130755424499512, "learning_rate": 1.946128755951332e-06, "loss": 0.0325, "reward": 1.757265567779541, "reward_std": 1.6534045934677124, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.30726563930511475, "rewards/reward_search_strategy": 0.32500001788139343, "step": 614 }, { "completion_length": 636.125, "epoch": 2.1503496503496504, "grad_norm": 1.3049726486206055, "kl": 0.8581527471542358, "learning_rate": 1.937622364140338e-06, "loss": 0.0343, "reward": 1.4249939918518066, "reward_std": 1.3708144426345825, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.24999403953552246, "rewards/reward_search_strategy": 0.17500001192092896, "step": 615 }, { "completion_length": 1154.375, "epoch": 2.1538461538461537, "grad_norm": 0.4864519238471985, "kl": 0.18407891690731049, "learning_rate": 1.9291228247233607e-06, "loss": 0.0074, "reward": 1.4323234558105469, "reward_std": 1.0012611150741577, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.40732336044311523, "rewards/reward_search_strategy": 0.2750000059604645, "step": 616 }, { "completion_length": 104.75, "epoch": 2.1573426573426575, "grad_norm": 3.3319404125213623, "kl": 1.3707588911056519, "learning_rate": 1.9206302412646074e-06, "loss": 0.0548, "reward": 3.4996678829193115, "reward_std": 3.602128028869629, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.2746679186820984, "rewards/reward_search_strategy": 0.22500000894069672, "step": 617 }, { "completion_length": 1008.625, "epoch": 2.160839160839161, "grad_norm": 0.85741126537323, "kl": 0.5215395092964172, "learning_rate": 1.912144717243525e-06, "loss": 0.0209, "reward": 2.4926795959472656, "reward_std": 2.4891631603240967, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.5426796674728394, "rewards/reward_search_strategy": 0.32499998807907104, "step": 618 }, { "completion_length": 742.375, "epoch": 2.164335664335664, "grad_norm": 0.8654571175575256, "kl": 0.3221873342990875, "learning_rate": 1.9036663560535484e-06, "loss": 0.0129, "reward": 2.079622268676758, "reward_std": 1.5551748275756836, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3046221137046814, "rewards/reward_search_strategy": 0.2750000059604645, "step": 619 }, { "completion_length": 457.0, "epoch": 2.167832167832168, "grad_norm": 1.6892919540405273, "kl": 0.6353630423545837, "learning_rate": 1.895195261000831e-06, "loss": 0.0254, "reward": 0.9319507479667664, "reward_std": 1.268846035003662, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.2819507420063019, "rewards/reward_search_strategy": 0.15000000596046448, "step": 620 }, { "completion_length": 776.75, "epoch": 2.1713286713286712, "grad_norm": 2.699998140335083, "kl": 0.5228149890899658, "learning_rate": 1.8867315353029937e-06, "loss": 0.0209, "reward": 1.3579305410385132, "reward_std": 1.0180671215057373, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.3579305112361908, "rewards/reward_search_strategy": 0.25, "step": 621 }, { "completion_length": 335.25, "epoch": 2.174825174825175, "grad_norm": 2.1976499557495117, "kl": 0.7445406913757324, "learning_rate": 1.8782752820878636e-06, "loss": 0.0298, "reward": 1.4963700771331787, "reward_std": 0.7110073566436768, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.4213700294494629, "rewards/reward_search_strategy": 0.07500000298023224, "step": 622 }, { "completion_length": 475.5, "epoch": 2.1783216783216783, "grad_norm": 1.735268473625183, "kl": 0.5818000435829163, "learning_rate": 1.8698266043922159e-06, "loss": 0.0233, "reward": 1.7498990297317505, "reward_std": 1.372759222984314, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.4998989403247833, "rewards/reward_search_strategy": 0.25, "step": 623 }, { "completion_length": 504.0, "epoch": 2.1818181818181817, "grad_norm": 3.104727268218994, "kl": 0.8847209215164185, "learning_rate": 1.8613856051605242e-06, "loss": 0.0354, "reward": 3.2298035621643066, "reward_std": 0.9418405294418335, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4548037052154541, "rewards/reward_search_strategy": 0.15000000596046448, "step": 624 }, { "completion_length": 709.875, "epoch": 2.1853146853146854, "grad_norm": 0.9933962225914001, "kl": 0.35580018162727356, "learning_rate": 1.852952387243698e-06, "loss": 0.0142, "reward": 2.2982277870178223, "reward_std": 1.8821719884872437, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3982280194759369, "rewards/reward_search_strategy": 0.2750000059604645, "step": 625 }, { "completion_length": 329.625, "epoch": 2.1888111888111887, "grad_norm": 5.319577693939209, "kl": 3.290881872177124, "learning_rate": 1.8445270533978387e-06, "loss": 0.1316, "reward": 0.805007815361023, "reward_std": 0.9074010252952576, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.25500786304473877, "rewards/reward_search_strategy": 0.05000000074505806, "step": 626 }, { "completion_length": 699.5, "epoch": 2.1923076923076925, "grad_norm": 1.924681305885315, "kl": 1.104554533958435, "learning_rate": 1.836109706282978e-06, "loss": 0.0442, "reward": 1.0210304260253906, "reward_std": 1.3331161737442017, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.22103038430213928, "rewards/reward_search_strategy": 0.05000000074505806, "step": 627 }, { "completion_length": 568.25, "epoch": 2.195804195804196, "grad_norm": 2.6070520877838135, "kl": 0.5881224274635315, "learning_rate": 1.827700448461836e-06, "loss": 0.0235, "reward": 2.8544085025787354, "reward_std": 2.1881022453308105, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4044085443019867, "rewards/reward_search_strategy": 0.32500001788139343, "step": 628 }, { "completion_length": 1027.25, "epoch": 2.199300699300699, "grad_norm": 2.4598379135131836, "kl": 1.632110834121704, "learning_rate": 1.8192993823985643e-06, "loss": 0.0653, "reward": 0.9249999523162842, "reward_std": 2.6162948608398438, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.02500000037252903, "rewards/reward_search_strategy": 0.02500000037252903, "step": 629 }, { "completion_length": 627.0, "epoch": 2.202797202797203, "grad_norm": 2.1487743854522705, "kl": 1.5370110273361206, "learning_rate": 1.8109066104575023e-06, "loss": 0.0615, "reward": 1.460218906402588, "reward_std": 1.470046877861023, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.2602188289165497, "rewards/reward_search_strategy": 0.07500000298023224, "step": 630 }, { "completion_length": 869.375, "epoch": 2.2062937062937062, "grad_norm": 1.0303807258605957, "kl": 0.21061992645263672, "learning_rate": 1.8025222349019273e-06, "loss": 0.0084, "reward": 1.3355481624603271, "reward_std": 1.430586338043213, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.28554826974868774, "rewards/reward_search_strategy": 0.05000000074505806, "step": 631 }, { "completion_length": 728.625, "epoch": 2.20979020979021, "grad_norm": 0.9784128665924072, "kl": 0.28733620047569275, "learning_rate": 1.7941463578928088e-06, "loss": 0.0115, "reward": 1.639373779296875, "reward_std": 1.9725110530853271, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.2893737554550171, "rewards/reward_search_strategy": 0.22500000894069672, "step": 632 }, { "completion_length": 781.375, "epoch": 2.2132867132867133, "grad_norm": 1.5272443294525146, "kl": 0.4059949517250061, "learning_rate": 1.7857790814875665e-06, "loss": 0.0162, "reward": 2.185642719268799, "reward_std": 1.6133114099502563, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.2856428325176239, "rewards/reward_search_strategy": 0.15000000596046448, "step": 633 }, { "completion_length": 876.75, "epoch": 2.2167832167832167, "grad_norm": 3.8766872882843018, "kl": 1.414270281791687, "learning_rate": 1.7774205076388207e-06, "loss": 0.0566, "reward": 1.3451478481292725, "reward_std": 2.013153314590454, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.17014791071414948, "rewards/reward_search_strategy": 0.05000000074505806, "step": 634 }, { "completion_length": 698.0, "epoch": 2.2202797202797204, "grad_norm": 1.516858458518982, "kl": 0.39291489124298096, "learning_rate": 1.7690707381931585e-06, "loss": 0.0157, "reward": 0.7239600419998169, "reward_std": 1.0574333667755127, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.0989600345492363, "rewards/reward_search_strategy": 0.125, "step": 635 }, { "completion_length": 286.25, "epoch": 2.2237762237762237, "grad_norm": 1.2062504291534424, "kl": 0.4344251751899719, "learning_rate": 1.7607298748898844e-06, "loss": 0.0174, "reward": 1.9949274063110352, "reward_std": 1.525809645652771, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3449275493621826, "rewards/reward_search_strategy": 0.15000000596046448, "step": 636 }, { "completion_length": 398.125, "epoch": 2.227272727272727, "grad_norm": 3.9527926445007324, "kl": 0.7711962461471558, "learning_rate": 1.7523980193597837e-06, "loss": 0.0308, "reward": 1.9532475471496582, "reward_std": 1.5099598169326782, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.32824742794036865, "rewards/reward_search_strategy": 0.125, "step": 637 }, { "completion_length": 177.125, "epoch": 2.230769230769231, "grad_norm": 2.8524467945098877, "kl": 0.6512960195541382, "learning_rate": 1.744075273123889e-06, "loss": 0.0261, "reward": 2.0816469192504883, "reward_std": 1.550155520439148, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.256646990776062, "rewards/reward_search_strategy": 0.20000001788139343, "step": 638 }, { "completion_length": 833.25, "epoch": 2.234265734265734, "grad_norm": 0.5548487305641174, "kl": 0.31493040919303894, "learning_rate": 1.735761737592236e-06, "loss": 0.0126, "reward": 2.217541217803955, "reward_std": 1.338832974433899, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4675411880016327, "rewards/reward_search_strategy": 0.125, "step": 639 }, { "completion_length": 571.25, "epoch": 2.237762237762238, "grad_norm": 3.3657655715942383, "kl": 1.0643417835235596, "learning_rate": 1.7274575140626318e-06, "loss": 0.0426, "reward": 1.813431978225708, "reward_std": 2.622892141342163, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.11343199759721756, "rewards/reward_search_strategy": 0.07500000298023224, "step": 640 }, { "completion_length": 188.5, "epoch": 2.2412587412587412, "grad_norm": 2.92663311958313, "kl": 1.4145941734313965, "learning_rate": 1.7191627037194187e-06, "loss": 0.0566, "reward": 1.5885727405548096, "reward_std": 1.5621074438095093, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.2385728508234024, "rewards/reward_search_strategy": 0.10000000149011612, "step": 641 }, { "completion_length": 718.25, "epoch": 2.2447552447552446, "grad_norm": 1.4574460983276367, "kl": 0.43313488364219666, "learning_rate": 1.7108774076322443e-06, "loss": 0.0173, "reward": 2.001852035522461, "reward_std": 1.1423044204711914, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3768518567085266, "rewards/reward_search_strategy": 0.125, "step": 642 }, { "completion_length": 442.25, "epoch": 2.2482517482517483, "grad_norm": 1.1469643115997314, "kl": 0.6819137930870056, "learning_rate": 1.702601726754825e-06, "loss": 0.0273, "reward": 1.0776560306549072, "reward_std": 1.2006266117095947, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.22765597701072693, "rewards/reward_search_strategy": 0.10000000149011612, "step": 643 }, { "completion_length": 386.125, "epoch": 2.2517482517482517, "grad_norm": 3.1043453216552734, "kl": 1.2604275941848755, "learning_rate": 1.6943357619237227e-06, "loss": 0.0504, "reward": 1.6263604164123535, "reward_std": 1.5263493061065674, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.1513603925704956, "rewards/reward_search_strategy": 0.10000000149011612, "step": 644 }, { "completion_length": 394.75, "epoch": 2.2552447552447554, "grad_norm": 2.7489726543426514, "kl": 0.6823881268501282, "learning_rate": 1.686079613857109e-06, "loss": 0.0273, "reward": 2.122854232788086, "reward_std": 1.7154755592346191, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.2478543519973755, "rewards/reward_search_strategy": 0.125, "step": 645 }, { "completion_length": 578.25, "epoch": 2.2587412587412588, "grad_norm": 1.5313423871994019, "kl": 0.47184064984321594, "learning_rate": 1.677833383153542e-06, "loss": 0.0189, "reward": 2.3924877643585205, "reward_std": 1.742182970046997, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4424876868724823, "rewards/reward_search_strategy": 0.20000001788139343, "step": 646 }, { "completion_length": 731.125, "epoch": 2.262237762237762, "grad_norm": 0.7179574370384216, "kl": 0.28321006894111633, "learning_rate": 1.6695971702907425e-06, "loss": 0.0113, "reward": 3.0394439697265625, "reward_std": 1.496739149093628, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5394439697265625, "rewards/reward_search_strategy": 0.125, "step": 647 }, { "completion_length": 424.0, "epoch": 2.265734265734266, "grad_norm": 1.839329481124878, "kl": 0.757418155670166, "learning_rate": 1.661371075624363e-06, "loss": 0.0303, "reward": 1.5853004455566406, "reward_std": 1.868477463722229, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.2353004813194275, "rewards/reward_search_strategy": 0.10000000149011612, "step": 648 }, { "completion_length": 168.125, "epoch": 2.269230769230769, "grad_norm": 3.7892937660217285, "kl": 0.8482879996299744, "learning_rate": 1.6531551993867717e-06, "loss": 0.0339, "reward": 2.2620646953582764, "reward_std": 1.6340628862380981, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.0870647132396698, "rewards/reward_search_strategy": 0.17500001192092896, "step": 649 }, { "completion_length": 342.875, "epoch": 2.2727272727272725, "grad_norm": 1.6862787008285522, "kl": 0.7084638476371765, "learning_rate": 1.6449496416858285e-06, "loss": 0.0283, "reward": 2.0570688247680664, "reward_std": 1.0292710065841675, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3320687413215637, "rewards/reward_search_strategy": 0.22499999403953552, "step": 650 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }