{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.797202797202797, "eval_steps": 500, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 145.875, "epoch": 0.0034965034965034965, "grad_norm": 8.602514266967773, "kl": 0.0, "learning_rate": 5.0000000000000004e-08, "loss": 0.0, "reward": 0.7051772475242615, "reward_std": 0.8263505697250366, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.3301772475242615, "rewards/reward_search_strategy": 0.0, "step": 1 }, { "completion_length": 88.125, "epoch": 0.006993006993006993, "grad_norm": 1.4985997676849365, "kl": 0.0, "learning_rate": 1.0000000000000001e-07, "loss": -0.0, "reward": 1.8388628959655762, "reward_std": 2.5329272747039795, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.2388630509376526, "rewards/reward_search_strategy": 0.10000000149011612, "step": 2 }, { "completion_length": 296.5, "epoch": 0.01048951048951049, "grad_norm": 0.7674520611763, "kl": 0.0006381775019690394, "learning_rate": 1.5000000000000002e-07, "loss": 0.0, "reward": 1.654070258140564, "reward_std": 2.827007532119751, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.17907029390335083, "rewards/reward_search_strategy": 0.22499999403953552, "step": 3 }, { "completion_length": 290.125, "epoch": 0.013986013986013986, "grad_norm": 1.0457236766815186, "kl": 0.0007995082996785641, "learning_rate": 2.0000000000000002e-07, "loss": 0.0, "reward": 3.421433448791504, "reward_std": 2.9866631031036377, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.396433562040329, "rewards/reward_search_strategy": 0.2750000059604645, "step": 4 }, { "completion_length": 259.125, "epoch": 0.017482517482517484, "grad_norm": 1.3121755123138428, "kl": 0.000728036102373153, "learning_rate": 2.5000000000000004e-07, "loss": 0.0, "reward": 0.9268761873245239, "reward_std": 0.7560767531394958, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.2268761545419693, "rewards/reward_search_strategy": 0.07500000298023224, "step": 5 }, { "completion_length": 383.5, "epoch": 0.02097902097902098, "grad_norm": 0.9658453464508057, "kl": 0.0007470359560102224, "learning_rate": 3.0000000000000004e-07, "loss": 0.0, "reward": 1.4594597816467285, "reward_std": 2.2520837783813477, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.25945988297462463, "rewards/reward_search_strategy": 0.07500000298023224, "step": 6 }, { "completion_length": 144.375, "epoch": 0.024475524475524476, "grad_norm": 1.1642646789550781, "kl": 0.0009361266857013106, "learning_rate": 3.5000000000000004e-07, "loss": 0.0, "reward": 1.378964900970459, "reward_std": 2.0963218212127686, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.30396491289138794, "rewards/reward_search_strategy": 0.07500000298023224, "step": 7 }, { "completion_length": 314.375, "epoch": 0.027972027972027972, "grad_norm": 5.21930456161499, "kl": 0.0008346753311343491, "learning_rate": 4.0000000000000003e-07, "loss": 0.0, "reward": 1.181166410446167, "reward_std": 1.6016024351119995, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.3561664819717407, "rewards/reward_search_strategy": 0.07500000298023224, "step": 8 }, { "completion_length": 98.375, "epoch": 0.03146853146853147, "grad_norm": 1.6168773174285889, "kl": 0.0009058149298653007, "learning_rate": 4.5000000000000003e-07, "loss": 0.0, "reward": 1.141206979751587, "reward_std": 2.1089890003204346, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.16620691120624542, "rewards/reward_search_strategy": 0.10000000149011612, "step": 9 }, { "completion_length": 269.625, "epoch": 0.03496503496503497, "grad_norm": 1.310072660446167, "kl": 0.000750248203985393, "learning_rate": 5.000000000000001e-07, "loss": 0.0, "reward": 1.901651382446289, "reward_std": 3.08128023147583, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.27665144205093384, "rewards/reward_search_strategy": 0.125, "step": 10 }, { "completion_length": 311.125, "epoch": 0.038461538461538464, "grad_norm": 1.1051065921783447, "kl": 0.000834438658785075, "learning_rate": 5.5e-07, "loss": 0.0, "reward": 2.1270594596862793, "reward_std": 2.309929132461548, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.377059668302536, "rewards/reward_search_strategy": 0.125, "step": 11 }, { "completion_length": 182.375, "epoch": 0.04195804195804196, "grad_norm": 1.3039528131484985, "kl": 0.0007587745785713196, "learning_rate": 6.000000000000001e-07, "loss": 0.0, "reward": 0.24762853980064392, "reward_std": 0.28513866662979126, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.17262855172157288, "rewards/reward_search_strategy": 0.07500000298023224, "step": 12 }, { "completion_length": 198.5, "epoch": 0.045454545454545456, "grad_norm": 1.176461935043335, "kl": 0.0008510244661010802, "learning_rate": 6.5e-07, "loss": 0.0, "reward": 2.0060770511627197, "reward_std": 2.2957346439361572, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.3060770034790039, "rewards/reward_search_strategy": 0.07500000298023224, "step": 13 }, { "completion_length": 158.25, "epoch": 0.04895104895104895, "grad_norm": 1.1484898328781128, "kl": 0.0009028888889588416, "learning_rate": 7.000000000000001e-07, "loss": 0.0, "reward": 0.2598831355571747, "reward_std": 0.4156216084957123, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.1348831057548523, "rewards/reward_search_strategy": 0.0, "step": 14 }, { "completion_length": 321.125, "epoch": 0.05244755244755245, "grad_norm": 1.08210027217865, "kl": 0.0007499511120840907, "learning_rate": 7.5e-07, "loss": 0.0, "reward": 2.711674213409424, "reward_std": 3.2537524700164795, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.3866744041442871, "rewards/reward_search_strategy": 0.20000000298023224, "step": 15 }, { "completion_length": 176.0, "epoch": 0.055944055944055944, "grad_norm": 1.007460117340088, "kl": 0.0008281145128421485, "learning_rate": 8.000000000000001e-07, "loss": 0.0, "reward": 1.6642776727676392, "reward_std": 3.1836605072021484, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.1892777681350708, "rewards/reward_search_strategy": 0.10000000149011612, "step": 16 }, { "completion_length": 299.0, "epoch": 0.05944055944055944, "grad_norm": 0.9308914542198181, "kl": 0.0006533896084874868, "learning_rate": 8.500000000000001e-07, "loss": 0.0, "reward": 2.9600870609283447, "reward_std": 3.002318859100342, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.38508710265159607, "rewards/reward_search_strategy": 0.07500000298023224, "step": 17 }, { "completion_length": 170.125, "epoch": 0.06293706293706294, "grad_norm": 1.6546720266342163, "kl": 0.0009436010150238872, "learning_rate": 9.000000000000001e-07, "loss": 0.0, "reward": 1.807711124420166, "reward_std": 2.873779773712158, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.33271118998527527, "rewards/reward_search_strategy": 0.22499999403953552, "step": 18 }, { "completion_length": 377.125, "epoch": 0.06643356643356643, "grad_norm": 0.7977221608161926, "kl": 0.0008231342071667314, "learning_rate": 9.500000000000001e-07, "loss": 0.0, "reward": 0.7038719058036804, "reward_std": 1.2563936710357666, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.22887186706066132, "rewards/reward_search_strategy": 0.10000000149011612, "step": 19 }, { "completion_length": 161.625, "epoch": 0.06993006993006994, "grad_norm": 1.5974496603012085, "kl": 0.000790413178037852, "learning_rate": 1.0000000000000002e-06, "loss": 0.0, "reward": 2.404862880706787, "reward_std": 2.712550163269043, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.179862841963768, "rewards/reward_search_strategy": 0.10000000149011612, "step": 20 }, { "completion_length": 298.875, "epoch": 0.07342657342657342, "grad_norm": 0.8817616105079651, "kl": 0.0007674504304304719, "learning_rate": 1.0500000000000001e-06, "loss": 0.0, "reward": 2.6999998092651367, "reward_std": 3.122270345687866, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.05000000074505806, "rewards/reward_search_strategy": 0.2750000059604645, "step": 21 }, { "completion_length": 150.125, "epoch": 0.07692307692307693, "grad_norm": 1.345902919769287, "kl": 0.0010101046646013856, "learning_rate": 1.1e-06, "loss": 0.0, "reward": 1.2625212669372559, "reward_std": 1.9802589416503906, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.13752126693725586, "rewards/reward_search_strategy": 0.0, "step": 22 }, { "completion_length": 156.625, "epoch": 0.08041958041958042, "grad_norm": 3.1841650009155273, "kl": 0.0009282200480811298, "learning_rate": 1.1500000000000002e-06, "loss": 0.0, "reward": 2.532832145690918, "reward_std": 3.623222827911377, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.18283231556415558, "rewards/reward_search_strategy": 0.22500000894069672, "step": 23 }, { "completion_length": 230.625, "epoch": 0.08391608391608392, "grad_norm": 0.560590922832489, "kl": 0.0008200581069104373, "learning_rate": 1.2000000000000002e-06, "loss": 0.0, "reward": 0.7196725606918335, "reward_std": 1.5131947994232178, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.1946725845336914, "rewards/reward_search_strategy": 0.02500000037252903, "step": 24 }, { "completion_length": 232.5, "epoch": 0.08741258741258741, "grad_norm": 5.627699375152588, "kl": 0.0007359281880781054, "learning_rate": 1.25e-06, "loss": 0.0, "reward": 2.3161444664001465, "reward_std": 2.266465425491333, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.31614452600479126, "rewards/reward_search_strategy": 0.0, "step": 25 }, { "completion_length": 256.625, "epoch": 0.09090909090909091, "grad_norm": 1.243622064590454, "kl": 0.0009776452789083123, "learning_rate": 1.3e-06, "loss": 0.0, "reward": 1.8542793989181519, "reward_std": 2.3880927562713623, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.22927939891815186, "rewards/reward_search_strategy": 0.0, "step": 26 }, { "completion_length": 221.625, "epoch": 0.0944055944055944, "grad_norm": 2.455430030822754, "kl": 0.0014260835014283657, "learning_rate": 1.3500000000000002e-06, "loss": 0.0001, "reward": 1.0012282133102417, "reward_std": 1.7000349760055542, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.15122826397418976, "rewards/reward_search_strategy": 0.10000000149011612, "step": 27 }, { "completion_length": 185.75, "epoch": 0.0979020979020979, "grad_norm": 1.199497938156128, "kl": 0.0007826727814972401, "learning_rate": 1.4000000000000001e-06, "loss": 0.0, "reward": 2.173332691192627, "reward_std": 2.801347255706787, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.32333269715309143, "rewards/reward_search_strategy": 0.10000000149011612, "step": 28 }, { "completion_length": 243.125, "epoch": 0.10139860139860139, "grad_norm": 0.8430306315422058, "kl": 0.0008962135761976242, "learning_rate": 1.45e-06, "loss": 0.0, "reward": 1.0625323057174683, "reward_std": 1.898645043373108, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.18753226101398468, "rewards/reward_search_strategy": 0.0, "step": 29 }, { "completion_length": 334.875, "epoch": 0.1048951048951049, "grad_norm": 0.8412113785743713, "kl": 0.001170428702607751, "learning_rate": 1.5e-06, "loss": 0.0, "reward": 1.303612232208252, "reward_std": 2.1826775074005127, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.27861225605010986, "rewards/reward_search_strategy": 0.02500000037252903, "step": 30 }, { "completion_length": 165.75, "epoch": 0.10839160839160839, "grad_norm": 1.9105679988861084, "kl": 0.0031302073039114475, "learning_rate": 1.5500000000000002e-06, "loss": 0.0001, "reward": 1.0198408365249634, "reward_std": 2.1316380500793457, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.16984085738658905, "rewards/reward_search_strategy": 0.10000000149011612, "step": 31 }, { "completion_length": 174.625, "epoch": 0.11188811188811189, "grad_norm": 2.309382915496826, "kl": 0.0032356895972043276, "learning_rate": 1.6000000000000001e-06, "loss": 0.0001, "reward": 0.9216341376304626, "reward_std": 1.8671420812606812, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.14663413166999817, "rewards/reward_search_strategy": 0.02500000037252903, "step": 32 }, { "completion_length": 85.75, "epoch": 0.11538461538461539, "grad_norm": 1.4122040271759033, "kl": 0.0017049856251105666, "learning_rate": 1.6500000000000003e-06, "loss": 0.0001, "reward": 1.524999976158142, "reward_std": 2.2926902770996094, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.10000000149011612, "rewards/reward_search_strategy": 0.05000000074505806, "step": 33 }, { "completion_length": 173.5, "epoch": 0.11888111888111888, "grad_norm": 0.9712541699409485, "kl": 0.0014842856908217072, "learning_rate": 1.7000000000000002e-06, "loss": 0.0001, "reward": 3.4844837188720703, "reward_std": 3.1145381927490234, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.3094840347766876, "rewards/reward_search_strategy": 0.17499999701976776, "step": 34 }, { "completion_length": 268.25, "epoch": 0.12237762237762238, "grad_norm": 1.1684075593948364, "kl": 0.0012478481512516737, "learning_rate": 1.75e-06, "loss": 0.0, "reward": 2.8716678619384766, "reward_std": 2.387646198272705, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.3716679811477661, "rewards/reward_search_strategy": 0.125, "step": 35 }, { "completion_length": 235.125, "epoch": 0.1258741258741259, "grad_norm": 1.2749513387680054, "kl": 0.004472827073186636, "learning_rate": 1.8000000000000001e-06, "loss": 0.0002, "reward": 1.8775699138641357, "reward_std": 2.413510322570801, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.30256983637809753, "rewards/reward_search_strategy": 0.07500000298023224, "step": 36 }, { "completion_length": 175.5, "epoch": 0.12937062937062938, "grad_norm": 0.8929537534713745, "kl": 0.0037049425300210714, "learning_rate": 1.85e-06, "loss": 0.0001, "reward": 0.6821428537368774, "reward_std": 1.4377206563949585, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.18214285373687744, "rewards/reward_search_strategy": 0.0, "step": 37 }, { "completion_length": 290.125, "epoch": 0.13286713286713286, "grad_norm": 2.52665114402771, "kl": 0.005157872103154659, "learning_rate": 1.9000000000000002e-06, "loss": 0.0002, "reward": 2.574741840362549, "reward_std": 2.5146002769470215, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.14974190294742584, "rewards/reward_search_strategy": 0.17500001192092896, "step": 38 }, { "completion_length": 344.125, "epoch": 0.13636363636363635, "grad_norm": 0.8259175419807434, "kl": 0.002582128159701824, "learning_rate": 1.9500000000000004e-06, "loss": 0.0001, "reward": 2.376612901687622, "reward_std": 2.9910624027252197, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.4266127943992615, "rewards/reward_search_strategy": 0.07500000298023224, "step": 39 }, { "completion_length": 243.125, "epoch": 0.13986013986013987, "grad_norm": 1.0021271705627441, "kl": 0.004831339232623577, "learning_rate": 2.0000000000000003e-06, "loss": 0.0002, "reward": 2.04349946975708, "reward_std": 2.3903656005859375, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.34349945187568665, "rewards/reward_search_strategy": 0.07500000298023224, "step": 40 }, { "completion_length": 207.375, "epoch": 0.14335664335664336, "grad_norm": 1.2446260452270508, "kl": 0.005710378754884005, "learning_rate": 2.05e-06, "loss": 0.0002, "reward": 2.8324475288391113, "reward_std": 3.167931079864502, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.25744765996932983, "rewards/reward_search_strategy": 0.20000000298023224, "step": 41 }, { "completion_length": 441.125, "epoch": 0.14685314685314685, "grad_norm": 0.8263920545578003, "kl": 0.0030913222581148148, "learning_rate": 2.1000000000000002e-06, "loss": 0.0001, "reward": 2.5794529914855957, "reward_std": 2.6344144344329834, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.30445292592048645, "rewards/reward_search_strategy": 0.15000000596046448, "step": 42 }, { "completion_length": 226.375, "epoch": 0.15034965034965034, "grad_norm": 1.141964316368103, "kl": 0.008909309282898903, "learning_rate": 2.15e-06, "loss": 0.0004, "reward": 1.8603672981262207, "reward_std": 2.203197479248047, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.21036729216575623, "rewards/reward_search_strategy": 0.02500000037252903, "step": 43 }, { "completion_length": 397.75, "epoch": 0.15384615384615385, "grad_norm": 0.9564006328582764, "kl": 0.01092858798801899, "learning_rate": 2.2e-06, "loss": 0.0004, "reward": 2.557056188583374, "reward_std": 2.429002046585083, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.28205621242523193, "rewards/reward_search_strategy": 0.02500000037252903, "step": 44 }, { "completion_length": 307.25, "epoch": 0.15734265734265734, "grad_norm": 1.0510841608047485, "kl": 0.007918575778603554, "learning_rate": 2.25e-06, "loss": 0.0003, "reward": 1.2331278324127197, "reward_std": 1.7548075914382935, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.15812794864177704, "rewards/reward_search_strategy": 0.07500000298023224, "step": 45 }, { "completion_length": 277.0, "epoch": 0.16083916083916083, "grad_norm": 0.9169591665267944, "kl": 0.015062836930155754, "learning_rate": 2.3000000000000004e-06, "loss": 0.0006, "reward": 1.2089338302612305, "reward_std": 2.5115318298339844, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.1339338719844818, "rewards/reward_search_strategy": 0.20000000298023224, "step": 46 }, { "completion_length": 206.25, "epoch": 0.16433566433566432, "grad_norm": 1.3427180051803589, "kl": 0.019624339416623116, "learning_rate": 2.35e-06, "loss": 0.0008, "reward": 4.1133527755737305, "reward_std": 3.2583820819854736, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.36335283517837524, "rewards/reward_search_strategy": 0.125, "step": 47 }, { "completion_length": 252.125, "epoch": 0.16783216783216784, "grad_norm": 0.7444542050361633, "kl": 0.021463895216584206, "learning_rate": 2.4000000000000003e-06, "loss": 0.0009, "reward": 1.631263017654419, "reward_std": 2.5925517082214355, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.23126289248466492, "rewards/reward_search_strategy": 0.02500000037252903, "step": 48 }, { "completion_length": 117.625, "epoch": 0.17132867132867133, "grad_norm": 1.888673186302185, "kl": 0.03609447553753853, "learning_rate": 2.4500000000000003e-06, "loss": 0.0014, "reward": 1.9036250114440918, "reward_std": 2.2273151874542236, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.20362500846385956, "rewards/reward_search_strategy": 0.07500000298023224, "step": 49 }, { "completion_length": 302.75, "epoch": 0.17482517482517482, "grad_norm": 1.2723031044006348, "kl": 0.2012804001569748, "learning_rate": 2.5e-06, "loss": 0.0081, "reward": 0.15956448018550873, "reward_std": 0.4513165056705475, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.03456447646021843, "rewards/reward_search_strategy": 0.0, "step": 50 }, { "completion_length": 222.0, "epoch": 0.17832167832167833, "grad_norm": 0.8149501085281372, "kl": 0.04784020781517029, "learning_rate": 2.55e-06, "loss": 0.0019, "reward": 0.49046826362609863, "reward_std": 0.9328532814979553, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.1654682457447052, "rewards/reward_search_strategy": 0.07500000298023224, "step": 51 }, { "completion_length": 329.75, "epoch": 0.18181818181818182, "grad_norm": 1.0090314149856567, "kl": 0.03431350365281105, "learning_rate": 2.6e-06, "loss": 0.0014, "reward": 1.8717395067214966, "reward_std": 2.0185225009918213, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.246739462018013, "rewards/reward_search_strategy": 0.25, "step": 52 }, { "completion_length": 292.5, "epoch": 0.1853146853146853, "grad_norm": 0.9738456606864929, "kl": 0.03869509696960449, "learning_rate": 2.6500000000000005e-06, "loss": 0.0015, "reward": 3.411203384399414, "reward_std": 2.776095390319824, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.28620344400405884, "rewards/reward_search_strategy": 0.0, "step": 53 }, { "completion_length": 262.25, "epoch": 0.1888111888111888, "grad_norm": 1.0264172554016113, "kl": 0.04221673682332039, "learning_rate": 2.7000000000000004e-06, "loss": 0.0017, "reward": 3.2230210304260254, "reward_std": 3.179140329360962, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.2730211913585663, "rewards/reward_search_strategy": 0.32500001788139343, "step": 54 }, { "completion_length": 278.0, "epoch": 0.19230769230769232, "grad_norm": 1.219490885734558, "kl": 0.07308313995599747, "learning_rate": 2.7500000000000004e-06, "loss": 0.0029, "reward": 2.9714736938476562, "reward_std": 3.078878164291382, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.32147371768951416, "rewards/reward_search_strategy": 0.2750000059604645, "step": 55 }, { "completion_length": 198.125, "epoch": 0.1958041958041958, "grad_norm": 2.261676788330078, "kl": 0.10734312981367111, "learning_rate": 2.8000000000000003e-06, "loss": 0.0043, "reward": 1.751394271850586, "reward_std": 2.916086196899414, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.35139432549476624, "rewards/reward_search_strategy": 0.15000000596046448, "step": 56 }, { "completion_length": 368.25, "epoch": 0.1993006993006993, "grad_norm": 1.0139119625091553, "kl": 0.07945723086595535, "learning_rate": 2.85e-06, "loss": 0.0032, "reward": 2.6666977405548096, "reward_std": 3.0744428634643555, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.1416977047920227, "rewards/reward_search_strategy": 0.15000000596046448, "step": 57 }, { "completion_length": 445.875, "epoch": 0.20279720279720279, "grad_norm": 1.2591164112091064, "kl": 0.0817643404006958, "learning_rate": 2.9e-06, "loss": 0.0033, "reward": 3.105250358581543, "reward_std": 2.503899574279785, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.4302505850791931, "rewards/reward_search_strategy": 0.17499999701976776, "step": 58 }, { "completion_length": 465.875, "epoch": 0.2062937062937063, "grad_norm": 0.6122844219207764, "kl": 0.0703798234462738, "learning_rate": 2.95e-06, "loss": 0.0028, "reward": 1.6243177652359009, "reward_std": 1.8378466367721558, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.37431782484054565, "rewards/reward_search_strategy": 0.125, "step": 59 }, { "completion_length": 256.375, "epoch": 0.2097902097902098, "grad_norm": 0.9197457432746887, "kl": 0.08273329585790634, "learning_rate": 3e-06, "loss": 0.0033, "reward": 2.420247793197632, "reward_std": 2.3122122287750244, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.4202479422092438, "rewards/reward_search_strategy": 0.25, "step": 60 }, { "completion_length": 517.125, "epoch": 0.21328671328671328, "grad_norm": 0.8799907565116882, "kl": 0.20334148406982422, "learning_rate": 3.05e-06, "loss": 0.0081, "reward": 1.377384066581726, "reward_std": 1.68935227394104, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.3023841083049774, "rewards/reward_search_strategy": 0.07500000298023224, "step": 61 }, { "completion_length": 502.5, "epoch": 0.21678321678321677, "grad_norm": 0.7109338641166687, "kl": 0.03114498406648636, "learning_rate": 3.1000000000000004e-06, "loss": 0.0012, "reward": 5.466271877288818, "reward_std": 2.7285211086273193, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.6912716627120972, "rewards/reward_search_strategy": 0.4000000059604645, "step": 62 }, { "completion_length": 503.25, "epoch": 0.2202797202797203, "grad_norm": 0.9971331357955933, "kl": 0.20209850370883942, "learning_rate": 3.1500000000000003e-06, "loss": 0.0081, "reward": 1.2392462491989136, "reward_std": 1.4006211757659912, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.189246267080307, "rewards/reward_search_strategy": 0.17500001192092896, "step": 63 }, { "completion_length": 288.5, "epoch": 0.22377622377622378, "grad_norm": 0.8857221603393555, "kl": 0.10349184274673462, "learning_rate": 3.2000000000000003e-06, "loss": 0.0041, "reward": 1.3573650121688843, "reward_std": 1.9148555994033813, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.40736502408981323, "rewards/reward_search_strategy": 0.07500000298023224, "step": 64 }, { "completion_length": 445.875, "epoch": 0.22727272727272727, "grad_norm": 2.050611734390259, "kl": 0.9011820554733276, "learning_rate": 3.2500000000000002e-06, "loss": 0.036, "reward": 2.7885334491729736, "reward_std": 2.6393096446990967, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.46353352069854736, "rewards/reward_search_strategy": 0.20000000298023224, "step": 65 }, { "completion_length": 321.75, "epoch": 0.23076923076923078, "grad_norm": 0.8712911605834961, "kl": 0.11524824798107147, "learning_rate": 3.3000000000000006e-06, "loss": 0.0046, "reward": 2.963777542114258, "reward_std": 2.7310569286346436, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.21377724409103394, "rewards/reward_search_strategy": 0.125, "step": 66 }, { "completion_length": 579.5, "epoch": 0.23426573426573427, "grad_norm": 0.6312026977539062, "kl": 0.08477935940027237, "learning_rate": 3.3500000000000005e-06, "loss": 0.0034, "reward": 4.128048896789551, "reward_std": 2.959887742996216, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.30304890871047974, "rewards/reward_search_strategy": 0.20000000298023224, "step": 67 }, { "completion_length": 256.625, "epoch": 0.23776223776223776, "grad_norm": 0.835229218006134, "kl": 0.1031595766544342, "learning_rate": 3.4000000000000005e-06, "loss": 0.0041, "reward": 5.231247425079346, "reward_std": 2.7173686027526855, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.48124733567237854, "rewards/reward_search_strategy": 0.25, "step": 68 }, { "completion_length": 458.125, "epoch": 0.24125874125874125, "grad_norm": 1.0669630765914917, "kl": 0.050319548696279526, "learning_rate": 3.45e-06, "loss": 0.002, "reward": 2.627213716506958, "reward_std": 2.752366781234741, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.1022137925028801, "rewards/reward_search_strategy": 0.4000000059604645, "step": 69 }, { "completion_length": 758.125, "epoch": 0.24475524475524477, "grad_norm": 0.4896698594093323, "kl": 0.03787853941321373, "learning_rate": 3.5e-06, "loss": 0.0015, "reward": 3.715177059173584, "reward_std": 2.108518362045288, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.44017714262008667, "rewards/reward_search_strategy": 0.3999999761581421, "step": 70 }, { "completion_length": 359.625, "epoch": 0.24825174825174826, "grad_norm": 1.1720679998397827, "kl": 0.09393578767776489, "learning_rate": 3.5500000000000003e-06, "loss": 0.0038, "reward": 5.1651530265808105, "reward_std": 2.7064919471740723, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.39015328884124756, "rewards/reward_search_strategy": 0.5249999761581421, "step": 71 }, { "completion_length": 603.75, "epoch": 0.2517482517482518, "grad_norm": 0.92381352186203, "kl": 0.03434322774410248, "learning_rate": 3.6000000000000003e-06, "loss": 0.0014, "reward": 1.6090435981750488, "reward_std": 1.3871439695358276, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.5090435743331909, "rewards/reward_search_strategy": 0.22500000894069672, "step": 72 }, { "completion_length": 324.125, "epoch": 0.25524475524475526, "grad_norm": 0.8183465003967285, "kl": 0.09460737556219101, "learning_rate": 3.65e-06, "loss": 0.0038, "reward": 1.5781946182250977, "reward_std": 2.772611618041992, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.25319457054138184, "rewards/reward_search_strategy": 0.07500000298023224, "step": 73 }, { "completion_length": 479.5, "epoch": 0.25874125874125875, "grad_norm": 0.8604242205619812, "kl": 0.053619399666786194, "learning_rate": 3.7e-06, "loss": 0.0021, "reward": 4.117947578430176, "reward_std": 2.7907135486602783, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.41794782876968384, "rewards/reward_search_strategy": 0.20000000298023224, "step": 74 }, { "completion_length": 450.25, "epoch": 0.26223776223776224, "grad_norm": 2.1643834114074707, "kl": 0.19324736297130585, "learning_rate": 3.7500000000000005e-06, "loss": 0.0077, "reward": 4.815071105957031, "reward_std": 3.0326616764068604, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.39007121324539185, "rewards/reward_search_strategy": 0.42500001192092896, "step": 75 }, { "completion_length": 412.25, "epoch": 0.26573426573426573, "grad_norm": 0.8062530159950256, "kl": 0.0793648362159729, "learning_rate": 3.8000000000000005e-06, "loss": 0.0032, "reward": 3.5049312114715576, "reward_std": 2.922109603881836, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.5549312829971313, "rewards/reward_search_strategy": 0.20000000298023224, "step": 76 }, { "completion_length": 369.75, "epoch": 0.2692307692307692, "grad_norm": 1.467880368232727, "kl": 0.0835878923535347, "learning_rate": 3.85e-06, "loss": 0.0033, "reward": 2.802757978439331, "reward_std": 2.5307087898254395, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.3277580440044403, "rewards/reward_search_strategy": 0.22500000894069672, "step": 77 }, { "completion_length": 382.375, "epoch": 0.2727272727272727, "grad_norm": 4.771082401275635, "kl": 0.9713490605354309, "learning_rate": 3.900000000000001e-06, "loss": 0.0389, "reward": 2.052755117416382, "reward_std": 3.0059754848480225, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.3777550160884857, "rewards/reward_search_strategy": 0.30000001192092896, "step": 78 }, { "completion_length": 546.125, "epoch": 0.2762237762237762, "grad_norm": 0.48559486865997314, "kl": 0.037933606654405594, "learning_rate": 3.95e-06, "loss": 0.0015, "reward": 4.67194938659668, "reward_std": 1.9323244094848633, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5219494700431824, "rewards/reward_search_strategy": 0.3999999761581421, "step": 79 }, { "completion_length": 677.25, "epoch": 0.27972027972027974, "grad_norm": 0.42382290959358215, "kl": 0.0389033704996109, "learning_rate": 4.000000000000001e-06, "loss": 0.0016, "reward": 1.6928536891937256, "reward_std": 2.6118526458740234, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.367853581905365, "rewards/reward_search_strategy": 0.20000000298023224, "step": 80 }, { "completion_length": 472.125, "epoch": 0.28321678321678323, "grad_norm": 0.8816371560096741, "kl": 0.07524556666612625, "learning_rate": 4.05e-06, "loss": 0.003, "reward": 3.7551965713500977, "reward_std": 2.652488946914673, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.4801965653896332, "rewards/reward_search_strategy": 0.15000000596046448, "step": 81 }, { "completion_length": 1057.125, "epoch": 0.2867132867132867, "grad_norm": 0.5135111808776855, "kl": 0.03152136504650116, "learning_rate": 4.1e-06, "loss": 0.0013, "reward": 3.2323527336120605, "reward_std": 1.7208728790283203, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.682352602481842, "rewards/reward_search_strategy": 0.42500001192092896, "step": 82 }, { "completion_length": 485.375, "epoch": 0.2902097902097902, "grad_norm": 0.9151804447174072, "kl": 0.059312522411346436, "learning_rate": 4.15e-06, "loss": 0.0024, "reward": 1.5200467109680176, "reward_std": 1.384164810180664, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.4450467824935913, "rewards/reward_search_strategy": 0.07500000298023224, "step": 83 }, { "completion_length": 703.25, "epoch": 0.2937062937062937, "grad_norm": 0.708080530166626, "kl": 0.02395041473209858, "learning_rate": 4.2000000000000004e-06, "loss": 0.001, "reward": 4.584571838378906, "reward_std": 2.8503193855285645, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4345718026161194, "rewards/reward_search_strategy": 0.4000000059604645, "step": 84 }, { "completion_length": 507.75, "epoch": 0.2972027972027972, "grad_norm": 0.9842731356620789, "kl": 0.045889340341091156, "learning_rate": 4.25e-06, "loss": 0.0018, "reward": 6.216353893280029, "reward_std": 3.617387056350708, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4663536846637726, "rewards/reward_search_strategy": 0.75, "step": 85 }, { "completion_length": 362.75, "epoch": 0.3006993006993007, "grad_norm": 5.553736686706543, "kl": 0.31082066893577576, "learning_rate": 4.3e-06, "loss": 0.0124, "reward": 2.6352591514587402, "reward_std": 3.6418111324310303, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.18525923788547516, "rewards/reward_search_strategy": 0.32499998807907104, "step": 86 }, { "completion_length": 760.625, "epoch": 0.3041958041958042, "grad_norm": 0.40838149189949036, "kl": 0.03904329240322113, "learning_rate": 4.350000000000001e-06, "loss": 0.0016, "reward": 2.3138082027435303, "reward_std": 2.5393922328948975, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.43880823254585266, "rewards/reward_search_strategy": 0.375, "step": 87 }, { "completion_length": 233.875, "epoch": 0.3076923076923077, "grad_norm": 1.6991461515426636, "kl": 0.11344132572412491, "learning_rate": 4.4e-06, "loss": 0.0045, "reward": 5.137988090515137, "reward_std": 3.6587538719177246, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.28798776865005493, "rewards/reward_search_strategy": 0.4750000238418579, "step": 88 }, { "completion_length": 634.0, "epoch": 0.3111888111888112, "grad_norm": 1.008798599243164, "kl": 0.13237585127353668, "learning_rate": 4.450000000000001e-06, "loss": 0.0053, "reward": 2.58494234085083, "reward_std": 3.3475492000579834, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.30994224548339844, "rewards/reward_search_strategy": 0.3999999761581421, "step": 89 }, { "completion_length": 673.875, "epoch": 0.3146853146853147, "grad_norm": 0.38143813610076904, "kl": 0.04431832954287529, "learning_rate": 4.5e-06, "loss": 0.0018, "reward": 3.379831552505493, "reward_std": 2.6894783973693848, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.5048315525054932, "rewards/reward_search_strategy": 0.375, "step": 90 }, { "completion_length": 335.5, "epoch": 0.3181818181818182, "grad_norm": 1.326707124710083, "kl": 0.1567000150680542, "learning_rate": 4.5500000000000005e-06, "loss": 0.0063, "reward": 1.540712594985962, "reward_std": 1.9064555168151855, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.24071267247200012, "rewards/reward_search_strategy": 0.30000001192092896, "step": 91 }, { "completion_length": 479.25, "epoch": 0.32167832167832167, "grad_norm": 0.8233426213264465, "kl": 0.046305958181619644, "learning_rate": 4.600000000000001e-06, "loss": 0.0019, "reward": 4.979320526123047, "reward_std": 2.5221211910247803, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.6293203234672546, "rewards/reward_search_strategy": 0.3500000238418579, "step": 92 }, { "completion_length": 529.625, "epoch": 0.32517482517482516, "grad_norm": 0.8791444301605225, "kl": 0.049900904297828674, "learning_rate": 4.65e-06, "loss": 0.002, "reward": 4.73832893371582, "reward_std": 2.957406520843506, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5633291006088257, "rewards/reward_search_strategy": 0.550000011920929, "step": 93 }, { "completion_length": 566.25, "epoch": 0.32867132867132864, "grad_norm": 0.93157559633255, "kl": 0.10382921993732452, "learning_rate": 4.7e-06, "loss": 0.0042, "reward": 3.921233654022217, "reward_std": 3.259979486465454, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.39623361825942993, "rewards/reward_search_strategy": 0.5249999761581421, "step": 94 }, { "completion_length": 405.0, "epoch": 0.3321678321678322, "grad_norm": 0.6553046703338623, "kl": 0.05014060437679291, "learning_rate": 4.75e-06, "loss": 0.002, "reward": 4.048189163208008, "reward_std": 3.046299934387207, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3731893002986908, "rewards/reward_search_strategy": 0.550000011920929, "step": 95 }, { "completion_length": 559.0, "epoch": 0.3356643356643357, "grad_norm": 0.9019679427146912, "kl": 0.1154535710811615, "learning_rate": 4.800000000000001e-06, "loss": 0.0046, "reward": 3.3237497806549072, "reward_std": 2.755406379699707, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.19874979555606842, "rewards/reward_search_strategy": 0.375, "step": 96 }, { "completion_length": 513.125, "epoch": 0.33916083916083917, "grad_norm": 1.1792876720428467, "kl": 0.20403671264648438, "learning_rate": 4.85e-06, "loss": 0.0082, "reward": 4.584352970123291, "reward_std": 3.532867908477783, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4093528091907501, "rewards/reward_search_strategy": 0.550000011920929, "step": 97 }, { "completion_length": 390.75, "epoch": 0.34265734265734266, "grad_norm": 24.102558135986328, "kl": 1.4576557874679565, "learning_rate": 4.9000000000000005e-06, "loss": 0.0583, "reward": 3.2650551795959473, "reward_std": 1.5778415203094482, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4150553047657013, "rewards/reward_search_strategy": 0.4749999940395355, "step": 98 }, { "completion_length": 992.0, "epoch": 0.34615384615384615, "grad_norm": 1.07889723777771, "kl": 0.15134873986244202, "learning_rate": 4.95e-06, "loss": 0.0061, "reward": 3.4669156074523926, "reward_std": 3.278291940689087, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.31691551208496094, "rewards/reward_search_strategy": 0.4000000059604645, "step": 99 }, { "completion_length": 839.625, "epoch": 0.34965034965034963, "grad_norm": 0.5948315262794495, "kl": 0.03611273318529129, "learning_rate": 5e-06, "loss": 0.0014, "reward": 2.273786783218384, "reward_std": 1.3839948177337646, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5487868189811707, "rewards/reward_search_strategy": 0.4750000238418579, "step": 100 }, { "completion_length": 658.5, "epoch": 0.3531468531468531, "grad_norm": 0.6043696999549866, "kl": 0.07113679498434067, "learning_rate": 4.999984769144476e-06, "loss": 0.0028, "reward": 4.308359622955322, "reward_std": 2.9922401905059814, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.5333598256111145, "rewards/reward_search_strategy": 0.5250000357627869, "step": 101 }, { "completion_length": 658.375, "epoch": 0.35664335664335667, "grad_norm": 0.6331404447555542, "kl": 0.06461314111948013, "learning_rate": 4.999939076763487e-06, "loss": 0.0026, "reward": 4.334439277648926, "reward_std": 3.60390567779541, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3594394326210022, "rewards/reward_search_strategy": 0.6000000238418579, "step": 102 }, { "completion_length": 549.875, "epoch": 0.36013986013986016, "grad_norm": 0.9000433087348938, "kl": 0.18242445588111877, "learning_rate": 4.999862923413781e-06, "loss": 0.0073, "reward": 3.720637559890747, "reward_std": 3.3442742824554443, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.3706376552581787, "rewards/reward_search_strategy": 0.4750000238418579, "step": 103 }, { "completion_length": 560.875, "epoch": 0.36363636363636365, "grad_norm": 0.9529889822006226, "kl": 0.13583947718143463, "learning_rate": 4.999756310023261e-06, "loss": 0.0054, "reward": 5.090951442718506, "reward_std": 3.163343906402588, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.41595128178596497, "rewards/reward_search_strategy": 0.42500001192092896, "step": 104 }, { "completion_length": 784.5, "epoch": 0.36713286713286714, "grad_norm": 0.5621578693389893, "kl": 0.06874603033065796, "learning_rate": 4.9996192378909785e-06, "loss": 0.0027, "reward": 1.6759297847747803, "reward_std": 1.3889375925064087, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.4009298086166382, "rewards/reward_search_strategy": 0.2750000059604645, "step": 105 }, { "completion_length": 811.875, "epoch": 0.3706293706293706, "grad_norm": 0.3865280747413635, "kl": 0.03063635341823101, "learning_rate": 4.999451708687114e-06, "loss": 0.0012, "reward": 1.8364933729171753, "reward_std": 1.0776033401489258, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.186493381857872, "rewards/reward_search_strategy": 0.6499999761581421, "step": 106 }, { "completion_length": 767.0, "epoch": 0.3741258741258741, "grad_norm": 1.0235356092453003, "kl": 0.05863216146826744, "learning_rate": 4.9992537244529585e-06, "loss": 0.0023, "reward": 3.36350154876709, "reward_std": 2.900977611541748, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.43850138783454895, "rewards/reward_search_strategy": 0.6749999523162842, "step": 107 }, { "completion_length": 471.125, "epoch": 0.3776223776223776, "grad_norm": 1.1427947282791138, "kl": 0.05708540976047516, "learning_rate": 4.999025287600886e-06, "loss": 0.0023, "reward": 3.4347455501556396, "reward_std": 3.3557960987091064, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.28474536538124084, "rewards/reward_search_strategy": 0.4000000059604645, "step": 108 }, { "completion_length": 680.5, "epoch": 0.3811188811188811, "grad_norm": 0.5091893076896667, "kl": 0.06495320051908493, "learning_rate": 4.998766400914329e-06, "loss": 0.0026, "reward": 5.424283981323242, "reward_std": 3.0205914974212646, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.2242841124534607, "rewards/reward_search_strategy": 0.824999988079071, "step": 109 }, { "completion_length": 546.875, "epoch": 0.38461538461538464, "grad_norm": 0.7380320429801941, "kl": 0.080781489610672, "learning_rate": 4.99847706754774e-06, "loss": 0.0032, "reward": 3.1829848289489746, "reward_std": 1.9441657066345215, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.45798495411872864, "rewards/reward_search_strategy": 0.22500000894069672, "step": 110 }, { "completion_length": 696.125, "epoch": 0.3881118881118881, "grad_norm": 0.7519396543502808, "kl": 0.0903296023607254, "learning_rate": 4.998157291026553e-06, "loss": 0.0036, "reward": 3.140177011489868, "reward_std": 2.6969096660614014, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4151769280433655, "rewards/reward_search_strategy": 0.3500000238418579, "step": 111 }, { "completion_length": 530.5, "epoch": 0.3916083916083916, "grad_norm": 1.3212255239486694, "kl": 0.17164835333824158, "learning_rate": 4.997807075247147e-06, "loss": 0.0069, "reward": 2.3374075889587402, "reward_std": 1.7672338485717773, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.41240769624710083, "rewards/reward_search_strategy": 0.30000001192092896, "step": 112 }, { "completion_length": 703.375, "epoch": 0.3951048951048951, "grad_norm": 0.48862800002098083, "kl": 0.04361271113157272, "learning_rate": 4.997426424476787e-06, "loss": 0.0017, "reward": 4.811786651611328, "reward_std": 3.4794275760650635, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.48678621649742126, "rewards/reward_search_strategy": 0.699999988079071, "step": 113 }, { "completion_length": 666.375, "epoch": 0.3986013986013986, "grad_norm": 0.7718109488487244, "kl": 0.040162038058042526, "learning_rate": 4.9970153433535855e-06, "loss": 0.0016, "reward": 3.659369468688965, "reward_std": 2.0675926208496094, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.4593694806098938, "rewards/reward_search_strategy": 0.5750000476837158, "step": 114 }, { "completion_length": 640.625, "epoch": 0.4020979020979021, "grad_norm": 0.5138198137283325, "kl": 0.05123981833457947, "learning_rate": 4.9965738368864345e-06, "loss": 0.002, "reward": 2.629551410675049, "reward_std": 1.7166107892990112, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5045512914657593, "rewards/reward_search_strategy": 0.5, "step": 115 }, { "completion_length": 586.75, "epoch": 0.40559440559440557, "grad_norm": 1.4575413465499878, "kl": 0.08412657678127289, "learning_rate": 4.996101910454953e-06, "loss": 0.0034, "reward": 4.0647077560424805, "reward_std": 1.9030133485794067, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4147075116634369, "rewards/reward_search_strategy": 0.40000003576278687, "step": 116 }, { "completion_length": 514.625, "epoch": 0.4090909090909091, "grad_norm": 1.051775574684143, "kl": 0.09374314546585083, "learning_rate": 4.995599569809414e-06, "loss": 0.0037, "reward": 2.5817337036132812, "reward_std": 2.1470091342926025, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.30673372745513916, "rewards/reward_search_strategy": 0.2750000059604645, "step": 117 }, { "completion_length": 814.375, "epoch": 0.4125874125874126, "grad_norm": 0.4415126144886017, "kl": 0.07567695528268814, "learning_rate": 4.9950668210706795e-06, "loss": 0.003, "reward": 2.189971685409546, "reward_std": 1.4690890312194824, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.5649716854095459, "rewards/reward_search_strategy": 0.375, "step": 118 }, { "completion_length": 550.375, "epoch": 0.4160839160839161, "grad_norm": 0.50792396068573, "kl": 0.05492626503109932, "learning_rate": 4.994503670730126e-06, "loss": 0.0022, "reward": 4.787566661834717, "reward_std": 2.5005829334259033, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6875668168067932, "rewards/reward_search_strategy": 0.6000000238418579, "step": 119 }, { "completion_length": 609.0, "epoch": 0.4195804195804196, "grad_norm": 0.6223260760307312, "kl": 0.047494076192379, "learning_rate": 4.993910125649561e-06, "loss": 0.0019, "reward": 4.249635696411133, "reward_std": 2.9354586601257324, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3746356964111328, "rewards/reward_search_strategy": 0.3750000298023224, "step": 120 }, { "completion_length": 560.875, "epoch": 0.4230769230769231, "grad_norm": 0.9563676118850708, "kl": 0.09799660742282867, "learning_rate": 4.993286193061145e-06, "loss": 0.0039, "reward": 2.565558910369873, "reward_std": 2.178126811981201, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3405587375164032, "rewards/reward_search_strategy": 0.3500000238418579, "step": 121 }, { "completion_length": 387.625, "epoch": 0.42657342657342656, "grad_norm": 0.7505446076393127, "kl": 0.14984500408172607, "learning_rate": 4.992631880567301e-06, "loss": 0.006, "reward": 1.1751203536987305, "reward_std": 1.6967118978500366, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.20012035965919495, "rewards/reward_search_strategy": 0.22500000894069672, "step": 122 }, { "completion_length": 411.875, "epoch": 0.43006993006993005, "grad_norm": 1.1681523323059082, "kl": 0.0963008776307106, "learning_rate": 4.991947196140619e-06, "loss": 0.0039, "reward": 3.0484721660614014, "reward_std": 3.416553497314453, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.3984720706939697, "rewards/reward_search_strategy": 0.4000000059604645, "step": 123 }, { "completion_length": 423.75, "epoch": 0.43356643356643354, "grad_norm": 0.9286041259765625, "kl": 0.07872038334608078, "learning_rate": 4.9912321481237616e-06, "loss": 0.0031, "reward": 3.2697560787200928, "reward_std": 3.4370007514953613, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3947560787200928, "rewards/reward_search_strategy": 0.5, "step": 124 }, { "completion_length": 463.625, "epoch": 0.4370629370629371, "grad_norm": 1.0566741228103638, "kl": 0.0985063686966896, "learning_rate": 4.990486745229364e-06, "loss": 0.0039, "reward": 1.5522143840789795, "reward_std": 1.579607605934143, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.37721437215805054, "rewards/reward_search_strategy": 0.17500001192092896, "step": 125 }, { "completion_length": 685.625, "epoch": 0.4405594405594406, "grad_norm": 0.6656001806259155, "kl": 0.06633864343166351, "learning_rate": 4.989710996539926e-06, "loss": 0.0027, "reward": 2.43574857711792, "reward_std": 2.3410024642944336, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.585748553276062, "rewards/reward_search_strategy": 0.3500000238418579, "step": 126 }, { "completion_length": 637.375, "epoch": 0.44405594405594406, "grad_norm": 0.7161982655525208, "kl": 0.07665619254112244, "learning_rate": 4.9889049115077e-06, "loss": 0.0031, "reward": 3.5046868324279785, "reward_std": 2.549543619155884, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5796867609024048, "rewards/reward_search_strategy": 0.550000011920929, "step": 127 }, { "completion_length": 644.375, "epoch": 0.44755244755244755, "grad_norm": 0.5414532423019409, "kl": 0.06415767967700958, "learning_rate": 4.988068499954578e-06, "loss": 0.0026, "reward": 5.926258563995361, "reward_std": 2.7303335666656494, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6762584447860718, "rewards/reward_search_strategy": 0.625, "step": 128 }, { "completion_length": 397.25, "epoch": 0.45104895104895104, "grad_norm": 0.8846190571784973, "kl": 0.14254269003868103, "learning_rate": 4.987201772071971e-06, "loss": 0.0057, "reward": 2.7677345275878906, "reward_std": 2.424544095993042, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5427343845367432, "rewards/reward_search_strategy": 0.4750000238418579, "step": 129 }, { "completion_length": 463.875, "epoch": 0.45454545454545453, "grad_norm": 0.6526610851287842, "kl": 0.12603093683719635, "learning_rate": 4.986304738420684e-06, "loss": 0.005, "reward": 5.665769577026367, "reward_std": 2.0002214908599854, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.1657698005437851, "rewards/reward_search_strategy": 0.5, "step": 130 }, { "completion_length": 425.125, "epoch": 0.458041958041958, "grad_norm": 1.6641560792922974, "kl": 0.2266431599855423, "learning_rate": 4.985377409930789e-06, "loss": 0.0091, "reward": 4.595471382141113, "reward_std": 3.3347935676574707, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.42047110199928284, "rewards/reward_search_strategy": 0.550000011920929, "step": 131 }, { "completion_length": 708.75, "epoch": 0.46153846153846156, "grad_norm": 0.49309638142585754, "kl": 0.07724378257989883, "learning_rate": 4.984419797901491e-06, "loss": 0.0031, "reward": 2.9709227085113525, "reward_std": 0.63145512342453, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.4459228515625, "rewards/reward_search_strategy": 0.5249999761581421, "step": 132 }, { "completion_length": 570.75, "epoch": 0.46503496503496505, "grad_norm": 0.8325252532958984, "kl": 0.15752217173576355, "learning_rate": 4.983431914000991e-06, "loss": 0.0063, "reward": 2.9882287979125977, "reward_std": 2.527655601501465, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.463228702545166, "rewards/reward_search_strategy": 0.5250000357627869, "step": 133 }, { "completion_length": 669.625, "epoch": 0.46853146853146854, "grad_norm": 1.057653784751892, "kl": 0.10220281034708023, "learning_rate": 4.9824137702663424e-06, "loss": 0.0041, "reward": 2.843283176422119, "reward_std": 2.226605176925659, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3932831287384033, "rewards/reward_search_strategy": 0.45000001788139343, "step": 134 }, { "completion_length": 387.25, "epoch": 0.47202797202797203, "grad_norm": 0.7793525457382202, "kl": 0.09381990879774094, "learning_rate": 4.981365379103306e-06, "loss": 0.0038, "reward": 3.3774726390838623, "reward_std": 2.2550406455993652, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.37747257947921753, "rewards/reward_search_strategy": 0.5, "step": 135 }, { "completion_length": 454.625, "epoch": 0.4755244755244755, "grad_norm": 0.6707426309585571, "kl": 0.09792742133140564, "learning_rate": 4.980286753286196e-06, "loss": 0.0039, "reward": 5.85577392578125, "reward_std": 2.534198522567749, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5807737112045288, "rewards/reward_search_strategy": 0.6499999761581421, "step": 136 }, { "completion_length": 513.0, "epoch": 0.479020979020979, "grad_norm": 0.91905677318573, "kl": 0.12576593458652496, "learning_rate": 4.979177905957726e-06, "loss": 0.005, "reward": 2.693246603012085, "reward_std": 1.9298425912857056, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.44324639439582825, "rewards/reward_search_strategy": 0.25, "step": 137 }, { "completion_length": 535.0, "epoch": 0.4825174825174825, "grad_norm": 0.6421681642532349, "kl": 0.16189059615135193, "learning_rate": 4.978038850628855e-06, "loss": 0.0065, "reward": 2.3643674850463867, "reward_std": 2.4537479877471924, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.614367663860321, "rewards/reward_search_strategy": 0.5, "step": 138 }, { "completion_length": 388.625, "epoch": 0.486013986013986, "grad_norm": 1.6358879804611206, "kl": 0.31070151925086975, "learning_rate": 4.9768696011786095e-06, "loss": 0.0124, "reward": 7.369345664978027, "reward_std": 2.027414560317993, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.11934606730937958, "rewards/reward_search_strategy": 0.875, "step": 139 }, { "completion_length": 430.375, "epoch": 0.48951048951048953, "grad_norm": 1.3875221014022827, "kl": 0.1299194097518921, "learning_rate": 4.975670171853926e-06, "loss": 0.0052, "reward": 4.600214958190918, "reward_std": 3.4139461517333984, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4752153754234314, "rewards/reward_search_strategy": 0.625, "step": 140 }, { "completion_length": 619.375, "epoch": 0.493006993006993, "grad_norm": 1.2012630701065063, "kl": 0.07434721291065216, "learning_rate": 4.974440577269473e-06, "loss": 0.003, "reward": 3.8403759002685547, "reward_std": 2.251944065093994, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.34037598967552185, "rewards/reward_search_strategy": 0.625, "step": 141 }, { "completion_length": 662.0, "epoch": 0.4965034965034965, "grad_norm": 0.4479227066040039, "kl": 0.08045605570077896, "learning_rate": 4.973180832407471e-06, "loss": 0.0032, "reward": 4.356062889099121, "reward_std": 1.873792052268982, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.6060628294944763, "rewards/reward_search_strategy": 0.5, "step": 142 }, { "completion_length": 408.0, "epoch": 0.5, "grad_norm": 0.774638831615448, "kl": 0.16293062269687653, "learning_rate": 4.971890952617515e-06, "loss": 0.0065, "reward": 3.544522762298584, "reward_std": 3.194159507751465, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.31952300667762756, "rewards/reward_search_strategy": 0.3500000238418579, "step": 143 }, { "completion_length": 466.75, "epoch": 0.5034965034965035, "grad_norm": 0.8625693917274475, "kl": 0.10907953977584839, "learning_rate": 4.970570953616383e-06, "loss": 0.0044, "reward": 3.3587875366210938, "reward_std": 2.7467174530029297, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.3837875723838806, "rewards/reward_search_strategy": 0.3500000238418579, "step": 144 }, { "completion_length": 856.125, "epoch": 0.506993006993007, "grad_norm": 0.37043091654777527, "kl": 0.05690326541662216, "learning_rate": 4.9692208514878445e-06, "loss": 0.0023, "reward": 5.424896240234375, "reward_std": 2.465031862258911, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.49989593029022217, "rewards/reward_search_strategy": 0.7999999523162842, "step": 145 }, { "completion_length": 557.25, "epoch": 0.5104895104895105, "grad_norm": 0.8445433378219604, "kl": 0.06749890744686127, "learning_rate": 4.96784066268247e-06, "loss": 0.0027, "reward": 2.7921762466430664, "reward_std": 1.2953386306762695, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5421763062477112, "rewards/reward_search_strategy": 0.625, "step": 146 }, { "completion_length": 447.5, "epoch": 0.513986013986014, "grad_norm": 0.9278783798217773, "kl": 0.10697238147258759, "learning_rate": 4.966430404017424e-06, "loss": 0.0043, "reward": 3.9682509899139404, "reward_std": 3.156663417816162, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.41825127601623535, "rewards/reward_search_strategy": 0.42500001192092896, "step": 147 }, { "completion_length": 379.625, "epoch": 0.5174825174825175, "grad_norm": 1.5938533544540405, "kl": 0.11513354629278183, "learning_rate": 4.964990092676263e-06, "loss": 0.0046, "reward": 3.4113523960113525, "reward_std": 2.4409544467926025, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.2863525450229645, "rewards/reward_search_strategy": 0.5, "step": 148 }, { "completion_length": 411.875, "epoch": 0.5209790209790209, "grad_norm": 1.070616364479065, "kl": 0.2734951078891754, "learning_rate": 4.963519746208726e-06, "loss": 0.0109, "reward": 2.4380977153778076, "reward_std": 2.1429409980773926, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.363097608089447, "rewards/reward_search_strategy": 0.20000000298023224, "step": 149 }, { "completion_length": 355.75, "epoch": 0.5244755244755245, "grad_norm": 0.9825189113616943, "kl": 0.14074762165546417, "learning_rate": 4.962019382530521e-06, "loss": 0.0056, "reward": 6.416107177734375, "reward_std": 3.2488479614257812, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6411075592041016, "rewards/reward_search_strategy": 0.7749999761581421, "step": 150 }, { "completion_length": 642.5, "epoch": 0.527972027972028, "grad_norm": 0.9618121385574341, "kl": 0.07677344232797623, "learning_rate": 4.960489019923105e-06, "loss": 0.0031, "reward": 2.5563745498657227, "reward_std": 1.2503511905670166, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4313744604587555, "rewards/reward_search_strategy": 0.625, "step": 151 }, { "completion_length": 513.375, "epoch": 0.5314685314685315, "grad_norm": 1.855919599533081, "kl": 0.21450263261795044, "learning_rate": 4.958928677033465e-06, "loss": 0.0086, "reward": 3.358847141265869, "reward_std": 1.8732644319534302, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.28384697437286377, "rewards/reward_search_strategy": 0.5750000476837158, "step": 152 }, { "completion_length": 478.375, "epoch": 0.534965034965035, "grad_norm": 0.5649963021278381, "kl": 0.059622377157211304, "learning_rate": 4.957338372873886e-06, "loss": 0.0024, "reward": 3.669475793838501, "reward_std": 3.19222354888916, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4944758117198944, "rewards/reward_search_strategy": 0.550000011920929, "step": 153 }, { "completion_length": 233.25, "epoch": 0.5384615384615384, "grad_norm": 1.1332588195800781, "kl": 0.09504832327365875, "learning_rate": 4.9557181268217225e-06, "loss": 0.0038, "reward": 3.0735678672790527, "reward_std": 2.9650135040283203, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.22356799244880676, "rewards/reward_search_strategy": 0.10000000149011612, "step": 154 }, { "completion_length": 365.75, "epoch": 0.541958041958042, "grad_norm": 0.7829000949859619, "kl": 0.12874586880207062, "learning_rate": 4.9540679586191605e-06, "loss": 0.0051, "reward": 3.2776992321014404, "reward_std": 2.2649874687194824, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4526992440223694, "rewards/reward_search_strategy": 0.44999998807907104, "step": 155 }, { "completion_length": 287.875, "epoch": 0.5454545454545454, "grad_norm": 1.0810236930847168, "kl": 0.17965207993984222, "learning_rate": 4.9523878883729794e-06, "loss": 0.0072, "reward": 5.42661714553833, "reward_std": 3.3766465187072754, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.40161705017089844, "rewards/reward_search_strategy": 0.6500000357627869, "step": 156 }, { "completion_length": 562.375, "epoch": 0.548951048951049, "grad_norm": 0.629564642906189, "kl": 0.047585126012563705, "learning_rate": 4.9506779365543054e-06, "loss": 0.0019, "reward": 3.9784889221191406, "reward_std": 0.8592731356620789, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.628489077091217, "rewards/reward_search_strategy": 0.4749999940395355, "step": 157 }, { "completion_length": 333.125, "epoch": 0.5524475524475524, "grad_norm": 0.8598470091819763, "kl": 0.23344238102436066, "learning_rate": 4.94893812399836e-06, "loss": 0.0093, "reward": 5.325415134429932, "reward_std": 2.6435444355010986, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3754151463508606, "rewards/reward_search_strategy": 0.44999998807907104, "step": 158 }, { "completion_length": 369.0, "epoch": 0.5559440559440559, "grad_norm": 1.518742322921753, "kl": 0.1705542653799057, "learning_rate": 4.947168471904213e-06, "loss": 0.0068, "reward": 5.637444972991943, "reward_std": 3.7453742027282715, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.41244518756866455, "rewards/reward_search_strategy": 0.6000000238418579, "step": 159 }, { "completion_length": 409.625, "epoch": 0.5594405594405595, "grad_norm": 1.2653708457946777, "kl": 0.2512824833393097, "learning_rate": 4.9453690018345144e-06, "loss": 0.0101, "reward": 1.8753092288970947, "reward_std": 1.6477832794189453, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.32530921697616577, "rewards/reward_search_strategy": 0.30000001192092896, "step": 160 }, { "completion_length": 766.375, "epoch": 0.5629370629370629, "grad_norm": 0.4625481367111206, "kl": 0.056169163435697556, "learning_rate": 4.9435397357152406e-06, "loss": 0.0022, "reward": 3.3957467079162598, "reward_std": 1.1029468774795532, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5957465767860413, "rewards/reward_search_strategy": 0.550000011920929, "step": 161 }, { "completion_length": 749.875, "epoch": 0.5664335664335665, "grad_norm": 0.6248977184295654, "kl": 0.07820535451173782, "learning_rate": 4.9416806958354206e-06, "loss": 0.0031, "reward": 4.84914493560791, "reward_std": 3.21990704536438, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.5491449236869812, "rewards/reward_search_strategy": 0.7999999523162842, "step": 162 }, { "completion_length": 318.375, "epoch": 0.5699300699300699, "grad_norm": 2.4732108116149902, "kl": 0.2569376230239868, "learning_rate": 4.939791904846869e-06, "loss": 0.0103, "reward": 5.542656898498535, "reward_std": 3.050025224685669, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.39265698194503784, "rewards/reward_search_strategy": 0.4000000059604645, "step": 163 }, { "completion_length": 638.5, "epoch": 0.5734265734265734, "grad_norm": 0.5316995978355408, "kl": 0.06073322519659996, "learning_rate": 4.937873385763909e-06, "loss": 0.0024, "reward": 3.4646921157836914, "reward_std": 0.9867516160011292, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.614692211151123, "rewards/reward_search_strategy": 0.4750000238418579, "step": 164 }, { "completion_length": 543.0, "epoch": 0.5769230769230769, "grad_norm": 8.116938591003418, "kl": 4.153862953186035, "learning_rate": 4.935925161963089e-06, "loss": 0.1662, "reward": 4.612438678741455, "reward_std": 3.483085870742798, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.5374384522438049, "rewards/reward_search_strategy": 0.699999988079071, "step": 165 }, { "completion_length": 378.5, "epoch": 0.5804195804195804, "grad_norm": 0.9611401557922363, "kl": 0.18193311989307404, "learning_rate": 4.933947257182901e-06, "loss": 0.0073, "reward": 4.473653316497803, "reward_std": 3.3241894245147705, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.19865311682224274, "rewards/reward_search_strategy": 0.7749999761581421, "step": 166 }, { "completion_length": 465.625, "epoch": 0.583916083916084, "grad_norm": 0.7452073097229004, "kl": 0.09764686226844788, "learning_rate": 4.9319396955234925e-06, "loss": 0.0039, "reward": 4.452567100524902, "reward_std": 3.8106887340545654, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4025672674179077, "rewards/reward_search_strategy": 0.550000011920929, "step": 167 }, { "completion_length": 613.75, "epoch": 0.5874125874125874, "grad_norm": 0.5454308390617371, "kl": 0.057498060166835785, "learning_rate": 4.9299025014463665e-06, "loss": 0.0023, "reward": 2.4831161499023438, "reward_std": 0.9839221239089966, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.43311628699302673, "rewards/reward_search_strategy": 0.550000011920929, "step": 168 }, { "completion_length": 654.25, "epoch": 0.5909090909090909, "grad_norm": 0.566487193107605, "kl": 0.04927007481455803, "learning_rate": 4.92783569977409e-06, "loss": 0.002, "reward": 3.5991201400756836, "reward_std": 2.8235316276550293, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.22412018477916718, "rewards/reward_search_strategy": 0.625, "step": 169 }, { "completion_length": 295.625, "epoch": 0.5944055944055944, "grad_norm": 1.1766833066940308, "kl": 0.24150727689266205, "learning_rate": 4.925739315689991e-06, "loss": 0.0097, "reward": 2.3266749382019043, "reward_std": 3.2796859741210938, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.27667519450187683, "rewards/reward_search_strategy": 0.30000001192092896, "step": 170 }, { "completion_length": 527.125, "epoch": 0.5979020979020979, "grad_norm": 0.6295068264007568, "kl": 0.08597421646118164, "learning_rate": 4.923613374737848e-06, "loss": 0.0034, "reward": 3.203831672668457, "reward_std": 2.0840423107147217, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6288318037986755, "rewards/reward_search_strategy": 0.574999988079071, "step": 171 }, { "completion_length": 423.5, "epoch": 0.6013986013986014, "grad_norm": 0.7889199256896973, "kl": 0.09207924455404282, "learning_rate": 4.921457902821578e-06, "loss": 0.0037, "reward": 3.5908405780792236, "reward_std": 2.698184013366699, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3158404231071472, "rewards/reward_search_strategy": 0.40000003576278687, "step": 172 }, { "completion_length": 515.875, "epoch": 0.6048951048951049, "grad_norm": 1.1472229957580566, "kl": 0.08138255774974823, "learning_rate": 4.9192729262049285e-06, "loss": 0.0033, "reward": 2.5571367740631104, "reward_std": 2.301767349243164, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.30713677406311035, "rewards/reward_search_strategy": 0.25, "step": 173 }, { "completion_length": 380.5, "epoch": 0.6083916083916084, "grad_norm": 1.0713775157928467, "kl": 0.08360132575035095, "learning_rate": 4.917058471511149e-06, "loss": 0.0033, "reward": 2.460089683532715, "reward_std": 2.595458507537842, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.4350895881652832, "rewards/reward_search_strategy": 0.2750000059604645, "step": 174 }, { "completion_length": 529.25, "epoch": 0.6118881118881119, "grad_norm": 0.463553249835968, "kl": 0.05024154111742973, "learning_rate": 4.914814565722671e-06, "loss": 0.002, "reward": 3.3107078075408936, "reward_std": 2.1608943939208984, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.48570770025253296, "rewards/reward_search_strategy": 0.5750000476837158, "step": 175 }, { "completion_length": 482.75, "epoch": 0.6153846153846154, "grad_norm": 1.2290676832199097, "kl": 0.14474555850028992, "learning_rate": 4.912541236180779e-06, "loss": 0.0058, "reward": 3.2196455001831055, "reward_std": 2.8522043228149414, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.2946455180644989, "rewards/reward_search_strategy": 0.42500001192092896, "step": 176 }, { "completion_length": 423.75, "epoch": 0.6188811188811189, "grad_norm": 1.102494716644287, "kl": 0.059243083000183105, "learning_rate": 4.910238510585275e-06, "loss": 0.0024, "reward": 2.8200690746307373, "reward_std": 2.557361364364624, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.39506906270980835, "rewards/reward_search_strategy": 0.42500001192092896, "step": 177 }, { "completion_length": 502.125, "epoch": 0.6223776223776224, "grad_norm": 1.207655668258667, "kl": 0.11078634113073349, "learning_rate": 4.907906416994146e-06, "loss": 0.0044, "reward": 4.589856147766113, "reward_std": 2.6245768070220947, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.48985564708709717, "rewards/reward_search_strategy": 0.6000000238418579, "step": 178 }, { "completion_length": 508.875, "epoch": 0.6258741258741258, "grad_norm": 1.0890285968780518, "kl": 0.08423227816820145, "learning_rate": 4.905544983823214e-06, "loss": 0.0034, "reward": 2.421203374862671, "reward_std": 1.544132113456726, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.44620347023010254, "rewards/reward_search_strategy": 0.3500000238418579, "step": 179 }, { "completion_length": 332.875, "epoch": 0.6293706293706294, "grad_norm": 1.8258018493652344, "kl": 0.15480613708496094, "learning_rate": 4.903154239845798e-06, "loss": 0.0062, "reward": 1.375791311264038, "reward_std": 1.4795390367507935, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.3257913589477539, "rewards/reward_search_strategy": 0.30000001192092896, "step": 180 }, { "completion_length": 312.25, "epoch": 0.6328671328671329, "grad_norm": 6.885568618774414, "kl": 0.5282591581344604, "learning_rate": 4.900734214192358e-06, "loss": 0.0211, "reward": 4.6441650390625, "reward_std": 3.3753483295440674, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5191651582717896, "rewards/reward_search_strategy": 0.5, "step": 181 }, { "completion_length": 442.375, "epoch": 0.6363636363636364, "grad_norm": 0.612838864326477, "kl": 0.08585107326507568, "learning_rate": 4.898284936350144e-06, "loss": 0.0034, "reward": 4.933065891265869, "reward_std": 3.642153739929199, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.7080655694007874, "rewards/reward_search_strategy": 0.4749999940395355, "step": 182 }, { "completion_length": 244.25, "epoch": 0.6398601398601399, "grad_norm": 1.8637006282806396, "kl": 0.38500577211380005, "learning_rate": 4.8958064361628334e-06, "loss": 0.0154, "reward": 6.256934642791748, "reward_std": 1.2352032661437988, "rewards/reward_correctness": 1.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4569346308708191, "rewards/reward_search_strategy": 0.30000001192092896, "step": 183 }, { "completion_length": 373.5, "epoch": 0.6433566433566433, "grad_norm": 1.1437357664108276, "kl": 0.12481357157230377, "learning_rate": 4.893298743830168e-06, "loss": 0.005, "reward": 3.356623411178589, "reward_std": 2.37040638923645, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5566233992576599, "rewards/reward_search_strategy": 0.30000001192092896, "step": 184 }, { "completion_length": 549.125, "epoch": 0.6468531468531469, "grad_norm": 0.6662846207618713, "kl": 0.10428506135940552, "learning_rate": 4.890761889907589e-06, "loss": 0.0042, "reward": 6.009631633758545, "reward_std": 3.1016297340393066, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.7346314191818237, "rewards/reward_search_strategy": 0.6499999761581421, "step": 185 }, { "completion_length": 549.0, "epoch": 0.6503496503496503, "grad_norm": 0.7750718593597412, "kl": 0.09990722686052322, "learning_rate": 4.888195905305859e-06, "loss": 0.004, "reward": 4.673529624938965, "reward_std": 2.434964656829834, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5485298037528992, "rewards/reward_search_strategy": 0.625, "step": 186 }, { "completion_length": 531.625, "epoch": 0.6538461538461539, "grad_norm": 0.8020265102386475, "kl": 0.09699496626853943, "learning_rate": 4.885600821290692e-06, "loss": 0.0039, "reward": 3.3051586151123047, "reward_std": 1.4629502296447754, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4051586389541626, "rewards/reward_search_strategy": 0.4000000059604645, "step": 187 }, { "completion_length": 522.875, "epoch": 0.6573426573426573, "grad_norm": 1.3859021663665771, "kl": 0.13135036826133728, "learning_rate": 4.882976669482368e-06, "loss": 0.0053, "reward": 2.6884827613830566, "reward_std": 2.2438197135925293, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.388482928276062, "rewards/reward_search_strategy": 0.30000001192092896, "step": 188 }, { "completion_length": 518.5, "epoch": 0.6608391608391608, "grad_norm": 9.410249710083008, "kl": 0.6956667900085449, "learning_rate": 4.880323481855347e-06, "loss": 0.0278, "reward": 4.050230979919434, "reward_std": 3.249497652053833, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.37523072957992554, "rewards/reward_search_strategy": 0.550000011920929, "step": 189 }, { "completion_length": 386.5, "epoch": 0.6643356643356644, "grad_norm": 0.8387676477432251, "kl": 0.08535090833902359, "learning_rate": 4.8776412907378845e-06, "loss": 0.0034, "reward": 4.588320255279541, "reward_std": 2.220287799835205, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.28832051157951355, "rewards/reward_search_strategy": 0.550000011920929, "step": 190 }, { "completion_length": 408.5, "epoch": 0.6678321678321678, "grad_norm": 2.006375551223755, "kl": 0.11828400194644928, "learning_rate": 4.874930128811631e-06, "loss": 0.0047, "reward": 3.0512070655822754, "reward_std": 2.421907424926758, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.30120688676834106, "rewards/reward_search_strategy": 0.5, "step": 191 }, { "completion_length": 245.75, "epoch": 0.6713286713286714, "grad_norm": 1.0472596883773804, "kl": 0.12134737521409988, "learning_rate": 4.8721900291112415e-06, "loss": 0.0049, "reward": 4.235280990600586, "reward_std": 3.679039716720581, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3602810502052307, "rewards/reward_search_strategy": 0.5, "step": 192 }, { "completion_length": 318.625, "epoch": 0.6748251748251748, "grad_norm": 1.2686798572540283, "kl": 0.17116394639015198, "learning_rate": 4.869421025023965e-06, "loss": 0.0068, "reward": 4.4517951011657715, "reward_std": 3.264738082885742, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5267948508262634, "rewards/reward_search_strategy": 0.42500001192092896, "step": 193 }, { "completion_length": 284.0, "epoch": 0.6783216783216783, "grad_norm": 1.0447121858596802, "kl": 0.2941432595252991, "learning_rate": 4.866623150289241e-06, "loss": 0.0118, "reward": 1.9646129608154297, "reward_std": 2.391249418258667, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.33961305022239685, "rewards/reward_search_strategy": 0.25, "step": 194 }, { "completion_length": 497.0, "epoch": 0.6818181818181818, "grad_norm": 0.604982852935791, "kl": 0.06880504637956619, "learning_rate": 4.863796438998293e-06, "loss": 0.0028, "reward": 6.05696964263916, "reward_std": 2.8059637546539307, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5319693684577942, "rewards/reward_search_strategy": 0.7749999761581421, "step": 195 }, { "completion_length": 322.625, "epoch": 0.6853146853146853, "grad_norm": 1.2860020399093628, "kl": 0.195574089884758, "learning_rate": 4.860940925593703e-06, "loss": 0.0078, "reward": 4.560000419616699, "reward_std": 2.813296318054199, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.31000030040740967, "rewards/reward_search_strategy": 0.375, "step": 196 }, { "completion_length": 536.75, "epoch": 0.6888111888111889, "grad_norm": 0.6069127321243286, "kl": 0.055065739899873734, "learning_rate": 4.858056644869002e-06, "loss": 0.0022, "reward": 4.489222049713135, "reward_std": 2.621090888977051, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5142220854759216, "rewards/reward_search_strategy": 0.6000000238418579, "step": 197 }, { "completion_length": 288.125, "epoch": 0.6923076923076923, "grad_norm": 1.4299031496047974, "kl": 0.28020644187927246, "learning_rate": 4.855143631968242e-06, "loss": 0.0112, "reward": 1.7020204067230225, "reward_std": 1.4690635204315186, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.2770204544067383, "rewards/reward_search_strategy": 0.42500001192092896, "step": 198 }, { "completion_length": 451.625, "epoch": 0.6958041958041958, "grad_norm": 0.6012458205223083, "kl": 0.10064040124416351, "learning_rate": 4.852201922385564e-06, "loss": 0.004, "reward": 4.045350551605225, "reward_std": 2.6484973430633545, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5953505635261536, "rewards/reward_search_strategy": 0.5750000476837158, "step": 199 }, { "completion_length": 381.75, "epoch": 0.6993006993006993, "grad_norm": 1.712384819984436, "kl": 0.23387499153614044, "learning_rate": 4.849231551964771e-06, "loss": 0.0094, "reward": 6.612201690673828, "reward_std": 3.49477481842041, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4872019290924072, "rewards/reward_search_strategy": 0.75, "step": 200 }, { "completion_length": 489.0, "epoch": 0.7027972027972028, "grad_norm": 0.6931152939796448, "kl": 0.12770113348960876, "learning_rate": 4.84623255689889e-06, "loss": 0.0051, "reward": 4.282477378845215, "reward_std": 2.721886157989502, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5324774384498596, "rewards/reward_search_strategy": 0.5, "step": 201 }, { "completion_length": 308.75, "epoch": 0.7062937062937062, "grad_norm": 7.35214376449585, "kl": 2.9334309101104736, "learning_rate": 4.84320497372973e-06, "loss": 0.1173, "reward": 2.2045934200286865, "reward_std": 2.125786304473877, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4045933187007904, "rewards/reward_search_strategy": 0.42500001192092896, "step": 202 }, { "completion_length": 415.75, "epoch": 0.7097902097902098, "grad_norm": 0.819303035736084, "kl": 0.15163056552410126, "learning_rate": 4.840148839347434e-06, "loss": 0.0061, "reward": 5.433012962341309, "reward_std": 2.9335484504699707, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.7580130100250244, "rewards/reward_search_strategy": 0.675000011920929, "step": 203 }, { "completion_length": 259.625, "epoch": 0.7132867132867133, "grad_norm": 0.8853353261947632, "kl": 0.15020889043807983, "learning_rate": 4.837064190990036e-06, "loss": 0.006, "reward": 4.723485946655273, "reward_std": 3.2774343490600586, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.39848607778549194, "rewards/reward_search_strategy": 0.32499998807907104, "step": 204 }, { "completion_length": 263.75, "epoch": 0.7167832167832168, "grad_norm": 1.2585346698760986, "kl": 0.20474490523338318, "learning_rate": 4.833951066243004e-06, "loss": 0.0082, "reward": 4.4211530685424805, "reward_std": 3.1804075241088867, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.39615336060523987, "rewards/reward_search_strategy": 0.4000000059604645, "step": 205 }, { "completion_length": 335.5, "epoch": 0.7202797202797203, "grad_norm": 1.0650960206985474, "kl": 0.306348592042923, "learning_rate": 4.830809503038781e-06, "loss": 0.0123, "reward": 7.345704078674316, "reward_std": 1.786327838897705, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6957041621208191, "rewards/reward_search_strategy": 0.6499999761581421, "step": 206 }, { "completion_length": 380.125, "epoch": 0.7237762237762237, "grad_norm": 1.0272008180618286, "kl": 0.24504351615905762, "learning_rate": 4.8276395396563215e-06, "loss": 0.0098, "reward": 3.7895874977111816, "reward_std": 2.8140430450439453, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3645874857902527, "rewards/reward_search_strategy": 0.42500001192092896, "step": 207 }, { "completion_length": 452.75, "epoch": 0.7272727272727273, "grad_norm": 0.7511652112007141, "kl": 0.22063565254211426, "learning_rate": 4.824441214720629e-06, "loss": 0.0088, "reward": 4.4974045753479, "reward_std": 3.246708631515503, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3724047541618347, "rewards/reward_search_strategy": 0.625, "step": 208 }, { "completion_length": 259.5, "epoch": 0.7307692307692307, "grad_norm": 1.161224126815796, "kl": 0.4260641038417816, "learning_rate": 4.821214567202284e-06, "loss": 0.017, "reward": 5.784225940704346, "reward_std": 2.628154993057251, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.40922629833221436, "rewards/reward_search_strategy": 0.375, "step": 209 }, { "completion_length": 429.0, "epoch": 0.7342657342657343, "grad_norm": 1.5333750247955322, "kl": 0.3188992738723755, "learning_rate": 4.817959636416969e-06, "loss": 0.0128, "reward": 5.550121784210205, "reward_std": 3.3491060733795166, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4501221776008606, "rewards/reward_search_strategy": 0.7249999642372131, "step": 210 }, { "completion_length": 526.125, "epoch": 0.7377622377622378, "grad_norm": 0.5344757437705994, "kl": 0.07630512863397598, "learning_rate": 4.814676462024988e-06, "loss": 0.0031, "reward": 4.594638824462891, "reward_std": 2.2206883430480957, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5946387052536011, "rewards/reward_search_strategy": 0.625, "step": 211 }, { "completion_length": 362.75, "epoch": 0.7412587412587412, "grad_norm": 0.9454336762428284, "kl": 0.18136979639530182, "learning_rate": 4.811365084030784e-06, "loss": 0.0073, "reward": 2.3874056339263916, "reward_std": 2.5450692176818848, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4374057650566101, "rewards/reward_search_strategy": 0.44999998807907104, "step": 212 }, { "completion_length": 497.875, "epoch": 0.7447552447552448, "grad_norm": 0.5748926997184753, "kl": 0.0902019739151001, "learning_rate": 4.808025542782453e-06, "loss": 0.0036, "reward": 6.283139228820801, "reward_std": 3.231776237487793, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5081396102905273, "rewards/reward_search_strategy": 0.7749999761581421, "step": 213 }, { "completion_length": 498.375, "epoch": 0.7482517482517482, "grad_norm": 0.6200758218765259, "kl": 0.08524007350206375, "learning_rate": 4.804657878971252e-06, "loss": 0.0034, "reward": 3.962136745452881, "reward_std": 2.2499866485595703, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.3621365427970886, "rewards/reward_search_strategy": 0.22500000894069672, "step": 214 }, { "completion_length": 382.75, "epoch": 0.7517482517482518, "grad_norm": 0.7669521570205688, "kl": 0.10141566395759583, "learning_rate": 4.801262133631101e-06, "loss": 0.0041, "reward": 4.497635841369629, "reward_std": 3.2545547485351562, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6476355791091919, "rewards/reward_search_strategy": 0.4750000238418579, "step": 215 }, { "completion_length": 465.0, "epoch": 0.7552447552447552, "grad_norm": 0.8276023268699646, "kl": 0.13839085400104523, "learning_rate": 4.7978383481380865e-06, "loss": 0.0055, "reward": 6.246551513671875, "reward_std": 2.7982473373413086, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4465520679950714, "rewards/reward_search_strategy": 0.550000011920929, "step": 216 }, { "completion_length": 348.875, "epoch": 0.7587412587412588, "grad_norm": 1.2766557931900024, "kl": 0.1600530445575714, "learning_rate": 4.794386564209953e-06, "loss": 0.0064, "reward": 3.271803855895996, "reward_std": 3.5930278301239014, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.37180379033088684, "rewards/reward_search_strategy": 0.4000000059604645, "step": 217 }, { "completion_length": 374.0, "epoch": 0.7622377622377622, "grad_norm": 0.9958794116973877, "kl": 0.15608493983745575, "learning_rate": 4.790906823905599e-06, "loss": 0.0062, "reward": 5.452359199523926, "reward_std": 2.1846044063568115, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.6023593544960022, "rewards/reward_search_strategy": 0.3500000238418579, "step": 218 }, { "completion_length": 352.625, "epoch": 0.7657342657342657, "grad_norm": 0.7433205842971802, "kl": 0.08911055326461792, "learning_rate": 4.787399169624562e-06, "loss": 0.0036, "reward": 5.496169090270996, "reward_std": 3.098224639892578, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.37116897106170654, "rewards/reward_search_strategy": 0.5, "step": 219 }, { "completion_length": 314.375, "epoch": 0.7692307692307693, "grad_norm": 1.3895384073257446, "kl": 0.14789187908172607, "learning_rate": 4.783863644106502e-06, "loss": 0.0059, "reward": 5.104283332824707, "reward_std": 3.202737808227539, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.42928358912467957, "rewards/reward_search_strategy": 0.550000011920929, "step": 220 }, { "completion_length": 483.0, "epoch": 0.7727272727272727, "grad_norm": 0.9903397560119629, "kl": 0.11351314932107925, "learning_rate": 4.780300290430683e-06, "loss": 0.0045, "reward": 3.8369758129119873, "reward_std": 2.3063299655914307, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4869758188724518, "rewards/reward_search_strategy": 0.6000000238418579, "step": 221 }, { "completion_length": 650.375, "epoch": 0.7762237762237763, "grad_norm": 1.8814777135849, "kl": 0.08025513589382172, "learning_rate": 4.776709152015443e-06, "loss": 0.0032, "reward": 3.4345803260803223, "reward_std": 2.4340741634368896, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.43458035588264465, "rewards/reward_search_strategy": 0.5, "step": 222 }, { "completion_length": 337.625, "epoch": 0.7797202797202797, "grad_norm": 1.796839952468872, "kl": 0.1309894621372223, "learning_rate": 4.773090272617672e-06, "loss": 0.0052, "reward": 4.828001976013184, "reward_std": 3.078373908996582, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.5030019283294678, "rewards/reward_search_strategy": 0.45000001788139343, "step": 223 }, { "completion_length": 257.75, "epoch": 0.7832167832167832, "grad_norm": 1.1037311553955078, "kl": 0.0957639217376709, "learning_rate": 4.769443696332272e-06, "loss": 0.0038, "reward": 1.960368037223816, "reward_std": 2.3306467533111572, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.33536797761917114, "rewards/reward_search_strategy": 0.25, "step": 224 }, { "completion_length": 448.125, "epoch": 0.7867132867132867, "grad_norm": 0.759524405002594, "kl": 0.10344026237726212, "learning_rate": 4.765769467591626e-06, "loss": 0.0041, "reward": 6.857405662536621, "reward_std": 2.5283827781677246, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6074060201644897, "rewards/reward_search_strategy": 0.75, "step": 225 }, { "completion_length": 506.625, "epoch": 0.7902097902097902, "grad_norm": 1.952030062675476, "kl": 0.19945436716079712, "learning_rate": 4.762067631165049e-06, "loss": 0.008, "reward": 2.5767011642456055, "reward_std": 1.8916816711425781, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.5267009735107422, "rewards/reward_search_strategy": 0.42500001192092896, "step": 226 }, { "completion_length": 805.5, "epoch": 0.7937062937062938, "grad_norm": 0.6654692888259888, "kl": 0.057463277131319046, "learning_rate": 4.7583382321582525e-06, "loss": 0.0023, "reward": 2.919649362564087, "reward_std": 2.193359375, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.44464927911758423, "rewards/reward_search_strategy": 0.4750000238418579, "step": 227 }, { "completion_length": 223.75, "epoch": 0.7972027972027972, "grad_norm": 1.6518536806106567, "kl": 0.19654996693134308, "learning_rate": 4.754581316012785e-06, "loss": 0.0079, "reward": 3.6179158687591553, "reward_std": 3.4085731506347656, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.26791587471961975, "rewards/reward_search_strategy": 0.3500000238418579, "step": 228 }, { "completion_length": 362.125, "epoch": 0.8006993006993007, "grad_norm": 1.1536964178085327, "kl": 0.10235659778118134, "learning_rate": 4.750796928505484e-06, "loss": 0.0041, "reward": 3.124476194381714, "reward_std": 2.4904367923736572, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.39947620034217834, "rewards/reward_search_strategy": 0.22500000894069672, "step": 229 }, { "completion_length": 428.25, "epoch": 0.8041958041958042, "grad_norm": 2.578925371170044, "kl": 0.25000184774398804, "learning_rate": 4.746985115747918e-06, "loss": 0.01, "reward": 4.44868278503418, "reward_std": 3.246079444885254, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.2986827492713928, "rewards/reward_search_strategy": 0.2750000059604645, "step": 230 }, { "completion_length": 518.625, "epoch": 0.8076923076923077, "grad_norm": 0.5841361284255981, "kl": 0.061389174312353134, "learning_rate": 4.743145924185821e-06, "loss": 0.0025, "reward": 4.760910511016846, "reward_std": 2.5773563385009766, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6859105229377747, "rewards/reward_search_strategy": 0.5750000476837158, "step": 231 }, { "completion_length": 446.0, "epoch": 0.8111888111888111, "grad_norm": 1.4337817430496216, "kl": 0.08883440494537354, "learning_rate": 4.7392794005985324e-06, "loss": 0.0036, "reward": 5.19779109954834, "reward_std": 2.8020260334014893, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.4727913737297058, "rewards/reward_search_strategy": 0.5999999642372131, "step": 232 }, { "completion_length": 342.0, "epoch": 0.8146853146853147, "grad_norm": 9.370593070983887, "kl": 1.3528209924697876, "learning_rate": 4.735385592098421e-06, "loss": 0.0541, "reward": 3.8530240058898926, "reward_std": 3.4489855766296387, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3780239224433899, "rewards/reward_search_strategy": 0.3500000238418579, "step": 233 }, { "completion_length": 552.375, "epoch": 0.8181818181818182, "grad_norm": 0.5406525731086731, "kl": 0.06817281246185303, "learning_rate": 4.731464546130315e-06, "loss": 0.0027, "reward": 6.135942459106445, "reward_std": 2.168360471725464, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.48594266176223755, "rewards/reward_search_strategy": 0.6500000357627869, "step": 234 }, { "completion_length": 182.125, "epoch": 0.8216783216783217, "grad_norm": 1.1569007635116577, "kl": 0.17475339770317078, "learning_rate": 4.72751631047092e-06, "loss": 0.007, "reward": 3.322808265686035, "reward_std": 4.106693267822266, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.2228086292743683, "rewards/reward_search_strategy": 0.3500000238418579, "step": 235 }, { "completion_length": 576.0, "epoch": 0.8251748251748252, "grad_norm": 0.7848924994468689, "kl": 0.09483487904071808, "learning_rate": 4.723540933228245e-06, "loss": 0.0038, "reward": 5.85687255859375, "reward_std": 3.493412733078003, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5068723559379578, "rewards/reward_search_strategy": 0.5999999642372131, "step": 236 }, { "completion_length": 435.125, "epoch": 0.8286713286713286, "grad_norm": 0.9283254146575928, "kl": 0.09187249094247818, "learning_rate": 4.719538462841003e-06, "loss": 0.0037, "reward": 3.7286319732666016, "reward_std": 1.7196377515792847, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.253632128238678, "rewards/reward_search_strategy": 0.3500000238418579, "step": 237 }, { "completion_length": 228.5, "epoch": 0.8321678321678322, "grad_norm": 1.0967156887054443, "kl": 0.11720684170722961, "learning_rate": 4.715508948078037e-06, "loss": 0.0047, "reward": 3.382150650024414, "reward_std": 3.5990474224090576, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.28215059638023376, "rewards/reward_search_strategy": 0.3500000238418579, "step": 238 }, { "completion_length": 315.0, "epoch": 0.8356643356643356, "grad_norm": 0.8198094367980957, "kl": 0.10767393559217453, "learning_rate": 4.71145243803771e-06, "loss": 0.0043, "reward": 3.447524070739746, "reward_std": 2.665365695953369, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4225241541862488, "rewards/reward_search_strategy": 0.2750000059604645, "step": 239 }, { "completion_length": 439.5, "epoch": 0.8391608391608392, "grad_norm": 0.6438246965408325, "kl": 0.0823763906955719, "learning_rate": 4.707368982147318e-06, "loss": 0.0033, "reward": 4.2677226066589355, "reward_std": 2.0768139362335205, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6177226901054382, "rewards/reward_search_strategy": 0.6499999761581421, "step": 240 }, { "completion_length": 452.625, "epoch": 0.8426573426573427, "grad_norm": 0.8286120295524597, "kl": 0.13944250345230103, "learning_rate": 4.703258630162481e-06, "loss": 0.0056, "reward": 6.840209007263184, "reward_std": 2.0674614906311035, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5402096509933472, "rewards/reward_search_strategy": 0.675000011920929, "step": 241 }, { "completion_length": 425.125, "epoch": 0.8461538461538461, "grad_norm": 0.6866943836212158, "kl": 0.08232449740171432, "learning_rate": 4.699121432166542e-06, "loss": 0.0033, "reward": 4.0894389152526855, "reward_std": 3.227137327194214, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.21443882584571838, "rewards/reward_search_strategy": 0.5, "step": 242 }, { "completion_length": 318.5, "epoch": 0.8496503496503497, "grad_norm": 1.1648399829864502, "kl": 0.19596882164478302, "learning_rate": 4.6949574385699514e-06, "loss": 0.0078, "reward": 3.9066293239593506, "reward_std": 3.0713882446289062, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3316292464733124, "rewards/reward_search_strategy": 0.44999998807907104, "step": 243 }, { "completion_length": 338.0, "epoch": 0.8531468531468531, "grad_norm": 45.41154479980469, "kl": 11.361538887023926, "learning_rate": 4.690766700109659e-06, "loss": 0.4545, "reward": 4.442761421203613, "reward_std": 3.3356175422668457, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.29276183247566223, "rewards/reward_search_strategy": 0.3999999761581421, "step": 244 }, { "completion_length": 272.375, "epoch": 0.8566433566433567, "grad_norm": 0.975391149520874, "kl": 0.0915113240480423, "learning_rate": 4.68654926784849e-06, "loss": 0.0037, "reward": 5.3376922607421875, "reward_std": 2.7289388179779053, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6126920580863953, "rewards/reward_search_strategy": 0.7250000238418579, "step": 245 }, { "completion_length": 328.0, "epoch": 0.8601398601398601, "grad_norm": 0.8504089117050171, "kl": 0.09103550016880035, "learning_rate": 4.682305193174524e-06, "loss": 0.0036, "reward": 6.278281211853027, "reward_std": 3.407573699951172, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5032811164855957, "rewards/reward_search_strategy": 0.6499999761581421, "step": 246 }, { "completion_length": 503.375, "epoch": 0.8636363636363636, "grad_norm": 0.8787162899971008, "kl": 0.08089443296194077, "learning_rate": 4.6780345278004744e-06, "loss": 0.0032, "reward": 5.4692606925964355, "reward_std": 3.3828439712524414, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4442606270313263, "rewards/reward_search_strategy": 0.5249999761581421, "step": 247 }, { "completion_length": 545.25, "epoch": 0.8671328671328671, "grad_norm": 0.5359931588172913, "kl": 0.12629981338977814, "learning_rate": 4.673737323763048e-06, "loss": 0.0051, "reward": 4.020110607147217, "reward_std": 2.231464147567749, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6951106786727905, "rewards/reward_search_strategy": 0.7000000476837158, "step": 248 }, { "completion_length": 406.625, "epoch": 0.8706293706293706, "grad_norm": 5.756573677062988, "kl": 4.332620143890381, "learning_rate": 4.669413633422322e-06, "loss": 0.1733, "reward": 4.411682605743408, "reward_std": 2.8709969520568848, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.4866824746131897, "rewards/reward_search_strategy": 0.675000011920929, "step": 249 }, { "completion_length": 659.625, "epoch": 0.8741258741258742, "grad_norm": 0.5628019571304321, "kl": 0.06190004199743271, "learning_rate": 4.665063509461098e-06, "loss": 0.0025, "reward": 5.015512466430664, "reward_std": 2.039670944213867, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.6405123472213745, "rewards/reward_search_strategy": 0.625, "step": 250 }, { "completion_length": 376.0, "epoch": 0.8776223776223776, "grad_norm": 0.6614730358123779, "kl": 0.10508718341588974, "learning_rate": 4.6606870048842626e-06, "loss": 0.0042, "reward": 4.854657173156738, "reward_std": 3.0948827266693115, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4296573996543884, "rewards/reward_search_strategy": 0.550000011920929, "step": 251 }, { "completion_length": 352.5, "epoch": 0.8811188811188811, "grad_norm": 1.1718766689300537, "kl": 0.10988699644804001, "learning_rate": 4.656284173018144e-06, "loss": 0.0044, "reward": 4.657190322875977, "reward_std": 3.0112640857696533, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.40719035267829895, "rewards/reward_search_strategy": 0.5, "step": 252 }, { "completion_length": 596.875, "epoch": 0.8846153846153846, "grad_norm": 0.6592712998390198, "kl": 0.08809557557106018, "learning_rate": 4.65185506750986e-06, "loss": 0.0035, "reward": 4.057185649871826, "reward_std": 2.201416015625, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.6321854591369629, "rewards/reward_search_strategy": 0.550000011920929, "step": 253 }, { "completion_length": 357.625, "epoch": 0.8881118881118881, "grad_norm": 1.0848559141159058, "kl": 0.17698736488819122, "learning_rate": 4.6473997423266615e-06, "loss": 0.0071, "reward": 2.2025272846221924, "reward_std": 2.108583450317383, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.45252731442451477, "rewards/reward_search_strategy": 0.375, "step": 254 }, { "completion_length": 423.375, "epoch": 0.8916083916083916, "grad_norm": 1.3164327144622803, "kl": 0.20678985118865967, "learning_rate": 4.642918251755281e-06, "loss": 0.0083, "reward": 4.640832901000977, "reward_std": 3.4112045764923096, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5658329129219055, "rewards/reward_search_strategy": 0.44999998807907104, "step": 255 }, { "completion_length": 480.875, "epoch": 0.8951048951048951, "grad_norm": 0.9758729934692383, "kl": 0.12582933902740479, "learning_rate": 4.638410650401267e-06, "loss": 0.005, "reward": 3.9991414546966553, "reward_std": 2.9747655391693115, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4241415858268738, "rewards/reward_search_strategy": 0.45000001788139343, "step": 256 }, { "completion_length": 377.875, "epoch": 0.8986013986013986, "grad_norm": 0.7494639754295349, "kl": 0.12961195409297943, "learning_rate": 4.633876993188319e-06, "loss": 0.0052, "reward": 5.370232582092285, "reward_std": 2.715336322784424, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.29523247480392456, "rewards/reward_search_strategy": 0.574999988079071, "step": 257 }, { "completion_length": 446.375, "epoch": 0.9020979020979021, "grad_norm": 0.9032400250434875, "kl": 0.12136492133140564, "learning_rate": 4.62931733535762e-06, "loss": 0.0049, "reward": 1.9620543718338013, "reward_std": 1.8072302341461182, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.3620542883872986, "rewards/reward_search_strategy": 0.3500000238418579, "step": 258 }, { "completion_length": 461.625, "epoch": 0.9055944055944056, "grad_norm": 0.8652839660644531, "kl": 0.09421462565660477, "learning_rate": 4.62473173246716e-06, "loss": 0.0038, "reward": 4.801279067993164, "reward_std": 3.08664870262146, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5762789845466614, "rewards/reward_search_strategy": 0.6000000238418579, "step": 259 }, { "completion_length": 373.375, "epoch": 0.9090909090909091, "grad_norm": 1.3319971561431885, "kl": 0.19203314185142517, "learning_rate": 4.620120240391065e-06, "loss": 0.0077, "reward": 3.6981348991394043, "reward_std": 2.8933234214782715, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4981347918510437, "rewards/reward_search_strategy": 0.45000001788139343, "step": 260 }, { "completion_length": 567.875, "epoch": 0.9125874125874126, "grad_norm": 0.39829006791114807, "kl": 0.08746462315320969, "learning_rate": 4.6154829153189105e-06, "loss": 0.0035, "reward": 4.194956302642822, "reward_std": 1.963258981704712, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.4949561357498169, "rewards/reward_search_strategy": 0.699999988079071, "step": 261 }, { "completion_length": 493.375, "epoch": 0.916083916083916, "grad_norm": 0.7264673113822937, "kl": 0.09607839584350586, "learning_rate": 4.610819813755038e-06, "loss": 0.0038, "reward": 3.0513057708740234, "reward_std": 1.9612208604812622, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.45130595564842224, "rewards/reward_search_strategy": 0.6000000238418579, "step": 262 }, { "completion_length": 447.25, "epoch": 0.9195804195804196, "grad_norm": 1.1438530683517456, "kl": 0.12347482144832611, "learning_rate": 4.60613099251787e-06, "loss": 0.0049, "reward": 5.763233184814453, "reward_std": 2.4886815547943115, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4132331609725952, "rewards/reward_search_strategy": 0.7250000238418579, "step": 263 }, { "completion_length": 428.375, "epoch": 0.9230769230769231, "grad_norm": 0.8694964051246643, "kl": 0.1385030597448349, "learning_rate": 4.601416508739211e-06, "loss": 0.0055, "reward": 3.2145180702209473, "reward_std": 2.9202535152435303, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3395180106163025, "rewards/reward_search_strategy": 0.625, "step": 264 }, { "completion_length": 381.125, "epoch": 0.9265734265734266, "grad_norm": 1.3948837518692017, "kl": 0.1681864708662033, "learning_rate": 4.596676419863561e-06, "loss": 0.0067, "reward": 6.393776893615723, "reward_std": 3.190995693206787, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.643777072429657, "rewards/reward_search_strategy": 0.75, "step": 265 }, { "completion_length": 394.75, "epoch": 0.9300699300699301, "grad_norm": 0.7370116710662842, "kl": 0.12830950319766998, "learning_rate": 4.591910783647405e-06, "loss": 0.0051, "reward": 5.736630439758301, "reward_std": 3.0993423461914062, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4366300702095032, "rewards/reward_search_strategy": 0.675000011920929, "step": 266 }, { "completion_length": 525.875, "epoch": 0.9335664335664335, "grad_norm": 0.7589148283004761, "kl": 0.11880780756473541, "learning_rate": 4.587119658158517e-06, "loss": 0.0048, "reward": 2.8846940994262695, "reward_std": 1.8504923582077026, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3846941590309143, "rewards/reward_search_strategy": 0.375, "step": 267 }, { "completion_length": 556.625, "epoch": 0.9370629370629371, "grad_norm": 0.7502413392066956, "kl": 0.13426969945430756, "learning_rate": 4.582303101775249e-06, "loss": 0.0054, "reward": 3.693828582763672, "reward_std": 2.5637710094451904, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.41882866621017456, "rewards/reward_search_strategy": 0.5249999761581421, "step": 268 }, { "completion_length": 387.125, "epoch": 0.9405594405594405, "grad_norm": 1.2606431245803833, "kl": 0.15010693669319153, "learning_rate": 4.577461173185821e-06, "loss": 0.006, "reward": 5.423138618469238, "reward_std": 2.955420970916748, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5231384038925171, "rewards/reward_search_strategy": 0.5250000357627869, "step": 269 }, { "completion_length": 544.5, "epoch": 0.9440559440559441, "grad_norm": 117.32878875732422, "kl": 7.282515525817871, "learning_rate": 4.572593931387604e-06, "loss": 0.2913, "reward": 3.634967088699341, "reward_std": 1.6427184343338013, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5599672198295593, "rewards/reward_search_strategy": 0.45000001788139343, "step": 270 }, { "completion_length": 344.625, "epoch": 0.9475524475524476, "grad_norm": 1.2033367156982422, "kl": 0.4850043058395386, "learning_rate": 4.567701435686405e-06, "loss": 0.0194, "reward": 2.6374354362487793, "reward_std": 3.4561848640441895, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.23743543028831482, "rewards/reward_search_strategy": 0.2750000059604645, "step": 271 }, { "completion_length": 450.25, "epoch": 0.951048951048951, "grad_norm": 9.75650691986084, "kl": 4.285886287689209, "learning_rate": 4.562783745695738e-06, "loss": 0.1714, "reward": 2.6909584999084473, "reward_std": 1.9044957160949707, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.490958571434021, "rewards/reward_search_strategy": 0.44999998807907104, "step": 272 }, { "completion_length": 308.375, "epoch": 0.9545454545454546, "grad_norm": 1.5673211812973022, "kl": 0.31804680824279785, "learning_rate": 4.5578409213361055e-06, "loss": 0.0127, "reward": 4.503492832183838, "reward_std": 3.3028485774993896, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5284927487373352, "rewards/reward_search_strategy": 0.4749999940395355, "step": 273 }, { "completion_length": 314.375, "epoch": 0.958041958041958, "grad_norm": 1.0376074314117432, "kl": 0.23754160106182098, "learning_rate": 4.55287302283426e-06, "loss": 0.0095, "reward": 5.486078262329102, "reward_std": 2.5428996086120605, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3610779643058777, "rewards/reward_search_strategy": 0.375, "step": 274 }, { "completion_length": 483.375, "epoch": 0.9615384615384616, "grad_norm": 0.7041484713554382, "kl": 0.15638189017772675, "learning_rate": 4.54788011072248e-06, "loss": 0.0063, "reward": 7.443948745727539, "reward_std": 2.539414882659912, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.7939489483833313, "rewards/reward_search_strategy": 0.8999999761581421, "step": 275 }, { "completion_length": 466.75, "epoch": 0.965034965034965, "grad_norm": 0.618880033493042, "kl": 0.132215678691864, "learning_rate": 4.542862245837821e-06, "loss": 0.0053, "reward": 3.8898563385009766, "reward_std": 2.4422481060028076, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.46485650539398193, "rewards/reward_search_strategy": 0.675000011920929, "step": 276 }, { "completion_length": 380.625, "epoch": 0.9685314685314685, "grad_norm": 2.440429210662842, "kl": 0.22437606751918793, "learning_rate": 4.537819489321385e-06, "loss": 0.009, "reward": 3.1129302978515625, "reward_std": 2.365227460861206, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.36293041706085205, "rewards/reward_search_strategy": 0.5, "step": 277 }, { "completion_length": 574.75, "epoch": 0.972027972027972, "grad_norm": 0.5336410999298096, "kl": 0.10619538277387619, "learning_rate": 4.5327519026175694e-06, "loss": 0.0042, "reward": 5.400971412658691, "reward_std": 2.051018714904785, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5259711742401123, "rewards/reward_search_strategy": 0.625, "step": 278 }, { "completion_length": 434.75, "epoch": 0.9755244755244755, "grad_norm": 0.6269961595535278, "kl": 0.12009799480438232, "learning_rate": 4.527659547473317e-06, "loss": 0.0048, "reward": 6.216640949249268, "reward_std": 2.1789255142211914, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.49164125323295593, "rewards/reward_search_strategy": 0.6000000238418579, "step": 279 }, { "completion_length": 352.75, "epoch": 0.9790209790209791, "grad_norm": 1.62322199344635, "kl": 0.16065241396427155, "learning_rate": 4.522542485937369e-06, "loss": 0.0064, "reward": 6.019310474395752, "reward_std": 3.3751046657562256, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.36931055784225464, "rewards/reward_search_strategy": 0.7749999761581421, "step": 280 }, { "completion_length": 321.25, "epoch": 0.9825174825174825, "grad_norm": 1.4410455226898193, "kl": 0.16992495954036713, "learning_rate": 4.517400780359505e-06, "loss": 0.0068, "reward": 6.315744400024414, "reward_std": 3.2042458057403564, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.515744686126709, "rewards/reward_search_strategy": 0.6749999523162842, "step": 281 }, { "completion_length": 531.75, "epoch": 0.986013986013986, "grad_norm": 0.589606761932373, "kl": 0.09500578045845032, "learning_rate": 4.512234493389785e-06, "loss": 0.0038, "reward": 5.298929214477539, "reward_std": 2.658466339111328, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.6739287376403809, "rewards/reward_search_strategy": 0.625, "step": 282 }, { "completion_length": 408.125, "epoch": 0.9895104895104895, "grad_norm": 2.672626256942749, "kl": 0.25274914503097534, "learning_rate": 4.507043687977787e-06, "loss": 0.0101, "reward": 5.0886993408203125, "reward_std": 2.9135916233062744, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.23869916796684265, "rewards/reward_search_strategy": 0.6000000238418579, "step": 283 }, { "completion_length": 496.5, "epoch": 0.993006993006993, "grad_norm": 0.5412589907646179, "kl": 0.12916360795497894, "learning_rate": 4.501828427371834e-06, "loss": 0.0052, "reward": 3.9245102405548096, "reward_std": 2.256361484527588, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.6245103478431702, "rewards/reward_search_strategy": 0.550000011920929, "step": 284 }, { "completion_length": 400.5, "epoch": 0.9965034965034965, "grad_norm": 0.8593103289604187, "kl": 0.131291002035141, "learning_rate": 4.496588775118232e-06, "loss": 0.0053, "reward": 5.690485954284668, "reward_std": 3.571183204650879, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.46548622846603394, "rewards/reward_search_strategy": 0.6000000238418579, "step": 285 }, { "completion_length": 564.25, "epoch": 1.0, "grad_norm": 0.6153562664985657, "kl": 0.09146779030561447, "learning_rate": 4.491324795060491e-06, "loss": 0.0037, "reward": 6.105203628540039, "reward_std": 2.1778714656829834, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.6802035570144653, "rewards/reward_search_strategy": 0.7999999523162842, "step": 286 }, { "completion_length": 334.25, "epoch": 1.0034965034965035, "grad_norm": 1.5982016324996948, "kl": 0.27556726336479187, "learning_rate": 4.4860365513385456e-06, "loss": 0.011, "reward": 6.495705604553223, "reward_std": 3.424804925918579, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.37070590257644653, "rewards/reward_search_strategy": 0.75, "step": 287 }, { "completion_length": 327.5, "epoch": 1.006993006993007, "grad_norm": 0.9249085187911987, "kl": 0.13310858607292175, "learning_rate": 4.4807241083879774e-06, "loss": 0.0053, "reward": 4.291748523712158, "reward_std": 3.5267059803009033, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4167482554912567, "rewards/reward_search_strategy": 0.625, "step": 288 }, { "completion_length": 353.375, "epoch": 1.0104895104895104, "grad_norm": 1.0354520082473755, "kl": 0.13781802356243134, "learning_rate": 4.475387530939226e-06, "loss": 0.0055, "reward": 6.14892578125, "reward_std": 3.4906580448150635, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5739257335662842, "rewards/reward_search_strategy": 0.699999988079071, "step": 289 }, { "completion_length": 442.0, "epoch": 1.013986013986014, "grad_norm": 0.5650519132614136, "kl": 0.09590885043144226, "learning_rate": 4.470026884016805e-06, "loss": 0.0038, "reward": 6.06260347366333, "reward_std": 2.7577667236328125, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5376033782958984, "rewards/reward_search_strategy": 0.7749999761581421, "step": 290 }, { "completion_length": 442.0, "epoch": 1.0174825174825175, "grad_norm": 0.5313113331794739, "kl": 0.10662511736154556, "learning_rate": 4.464642232938505e-06, "loss": 0.0043, "reward": 5.261750221252441, "reward_std": 2.6503186225891113, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.48675042390823364, "rewards/reward_search_strategy": 0.5249999761581421, "step": 291 }, { "completion_length": 182.5, "epoch": 1.020979020979021, "grad_norm": 1.577793002128601, "kl": 0.33014675974845886, "learning_rate": 4.4592336433146e-06, "loss": 0.0132, "reward": 3.2789976596832275, "reward_std": 2.985043525695801, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.1039976254105568, "rewards/reward_search_strategy": 0.42500001192092896, "step": 292 }, { "completion_length": 331.125, "epoch": 1.0244755244755244, "grad_norm": 0.7017413973808289, "kl": 0.1622174084186554, "learning_rate": 4.453801181047047e-06, "loss": 0.0065, "reward": 7.63859224319458, "reward_std": 1.5380568504333496, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.6885923147201538, "rewards/reward_search_strategy": 0.7000000476837158, "step": 293 }, { "completion_length": 438.125, "epoch": 1.027972027972028, "grad_norm": 2.848461866378784, "kl": 0.12165224552154541, "learning_rate": 4.448344912328686e-06, "loss": 0.0049, "reward": 3.944343090057373, "reward_std": 2.368406057357788, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.2943430542945862, "rewards/reward_search_strategy": 0.6500000357627869, "step": 294 }, { "completion_length": 472.125, "epoch": 1.0314685314685315, "grad_norm": 1.3413972854614258, "kl": 0.1494167149066925, "learning_rate": 4.442864903642428e-06, "loss": 0.006, "reward": 4.690557956695557, "reward_std": 2.62575626373291, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.49055802822113037, "rewards/reward_search_strategy": 0.574999988079071, "step": 295 }, { "completion_length": 493.125, "epoch": 1.034965034965035, "grad_norm": 1.2033652067184448, "kl": 0.17120806872844696, "learning_rate": 4.437361221760449e-06, "loss": 0.0068, "reward": 5.665086269378662, "reward_std": 3.6448543071746826, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.46508604288101196, "rewards/reward_search_strategy": 0.574999988079071, "step": 296 }, { "completion_length": 594.125, "epoch": 1.0384615384615385, "grad_norm": 0.5311799049377441, "kl": 0.07367473095655441, "learning_rate": 4.431833933743378e-06, "loss": 0.0029, "reward": 4.792994499206543, "reward_std": 1.6828709840774536, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5179944634437561, "rewards/reward_search_strategy": 0.5249999761581421, "step": 297 }, { "completion_length": 528.75, "epoch": 1.0419580419580419, "grad_norm": 0.5492209196090698, "kl": 0.13790830969810486, "learning_rate": 4.426283106939474e-06, "loss": 0.0055, "reward": 5.346704959869385, "reward_std": 2.976729393005371, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6967048645019531, "rewards/reward_search_strategy": 0.6500000357627869, "step": 298 }, { "completion_length": 352.0, "epoch": 1.0454545454545454, "grad_norm": 6.345902919769287, "kl": 8.095924377441406, "learning_rate": 4.420708808983809e-06, "loss": 0.3238, "reward": 6.351449966430664, "reward_std": 3.353184938430786, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5514500141143799, "rewards/reward_search_strategy": 0.675000011920929, "step": 299 }, { "completion_length": 431.25, "epoch": 1.048951048951049, "grad_norm": 0.7204697132110596, "kl": 0.13538429141044617, "learning_rate": 4.415111107797445e-06, "loss": 0.0054, "reward": 5.675325393676758, "reward_std": 1.948087453842163, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.6003252267837524, "rewards/reward_search_strategy": 0.5750000476837158, "step": 300 }, { "completion_length": 329.0, "epoch": 1.0524475524475525, "grad_norm": 0.9927306175231934, "kl": 0.12998303771018982, "learning_rate": 4.409490071586606e-06, "loss": 0.0052, "reward": 4.4069647789001465, "reward_std": 3.4473555088043213, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.2819647789001465, "rewards/reward_search_strategy": 0.625, "step": 301 }, { "completion_length": 307.75, "epoch": 1.055944055944056, "grad_norm": 1.6803146600723267, "kl": 0.1856156587600708, "learning_rate": 4.403845768841842e-06, "loss": 0.0074, "reward": 4.0473809242248535, "reward_std": 3.1361262798309326, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3223809003829956, "rewards/reward_search_strategy": 0.4750000238418579, "step": 302 }, { "completion_length": 539.0, "epoch": 1.0594405594405594, "grad_norm": 0.5144261717796326, "kl": 0.09425308555364609, "learning_rate": 4.398178268337202e-06, "loss": 0.0038, "reward": 5.513421535491943, "reward_std": 2.786677837371826, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5884215831756592, "rewards/reward_search_strategy": 0.550000011920929, "step": 303 }, { "completion_length": 483.625, "epoch": 1.062937062937063, "grad_norm": 0.6803086400032043, "kl": 0.09594997018575668, "learning_rate": 4.3924876391293915e-06, "loss": 0.0038, "reward": 5.4244794845581055, "reward_std": 2.6390738487243652, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5994799733161926, "rewards/reward_search_strategy": 0.574999988079071, "step": 304 }, { "completion_length": 559.25, "epoch": 1.0664335664335665, "grad_norm": 0.5547813773155212, "kl": 0.1060553565621376, "learning_rate": 4.386773950556931e-06, "loss": 0.0042, "reward": 5.821169853210449, "reward_std": 2.7936391830444336, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.7211699485778809, "rewards/reward_search_strategy": 0.7249999642372131, "step": 305 }, { "completion_length": 580.375, "epoch": 1.06993006993007, "grad_norm": 0.5855110883712769, "kl": 0.14783963561058044, "learning_rate": 4.381037272239311e-06, "loss": 0.0059, "reward": 6.188364028930664, "reward_std": 2.4008665084838867, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5883639454841614, "rewards/reward_search_strategy": 0.6000000238418579, "step": 306 }, { "completion_length": 447.125, "epoch": 1.0734265734265733, "grad_norm": 1.4545259475708008, "kl": 0.2002858966588974, "learning_rate": 4.3752776740761495e-06, "loss": 0.008, "reward": 5.376095771789551, "reward_std": 3.676356315612793, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.576095700263977, "rewards/reward_search_strategy": 0.675000011920929, "step": 307 }, { "completion_length": 539.5, "epoch": 1.0769230769230769, "grad_norm": 11.574174880981445, "kl": 2.8931026458740234, "learning_rate": 4.36949522624633e-06, "loss": 0.1157, "reward": 4.711579322814941, "reward_std": 2.978994131088257, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.7115795612335205, "rewards/reward_search_strategy": 0.625, "step": 308 }, { "completion_length": 297.625, "epoch": 1.0804195804195804, "grad_norm": 2.2476742267608643, "kl": 0.18139171600341797, "learning_rate": 4.3636899992071555e-06, "loss": 0.0073, "reward": 7.508697986602783, "reward_std": 3.040250062942505, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5586981773376465, "rewards/reward_search_strategy": 0.824999988079071, "step": 309 }, { "completion_length": 361.625, "epoch": 1.083916083916084, "grad_norm": 0.5807898640632629, "kl": 0.139273002743721, "learning_rate": 4.357862063693486e-06, "loss": 0.0056, "reward": 6.986447334289551, "reward_std": 2.1157562732696533, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5114474296569824, "rewards/reward_search_strategy": 0.7250000238418579, "step": 310 }, { "completion_length": 567.875, "epoch": 1.0874125874125875, "grad_norm": 0.5602300763130188, "kl": 0.1023474633693695, "learning_rate": 4.352011490716875e-06, "loss": 0.0041, "reward": 5.648248672485352, "reward_std": 2.6205856800079346, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5982489585876465, "rewards/reward_search_strategy": 0.675000011920929, "step": 311 }, { "completion_length": 401.5, "epoch": 1.0909090909090908, "grad_norm": 0.6656911969184875, "kl": 0.10482494533061981, "learning_rate": 4.346138351564711e-06, "loss": 0.0042, "reward": 5.354951858520508, "reward_std": 2.9195470809936523, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.6049516201019287, "rewards/reward_search_strategy": 0.75, "step": 312 }, { "completion_length": 490.5, "epoch": 1.0944055944055944, "grad_norm": 0.6362817287445068, "kl": 0.07986684143543243, "learning_rate": 4.340242717799337e-06, "loss": 0.0032, "reward": 5.589838981628418, "reward_std": 2.4936347007751465, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.41483867168426514, "rewards/reward_search_strategy": 0.675000011920929, "step": 313 }, { "completion_length": 395.25, "epoch": 1.097902097902098, "grad_norm": 1.8932520151138306, "kl": 0.1983058750629425, "learning_rate": 4.334324661257191e-06, "loss": 0.0079, "reward": 4.26251220703125, "reward_std": 3.2981317043304443, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.43751221895217896, "rewards/reward_search_strategy": 0.5750000476837158, "step": 314 }, { "completion_length": 454.0, "epoch": 1.1013986013986015, "grad_norm": 0.5324206352233887, "kl": 0.08979379385709763, "learning_rate": 4.328384254047927e-06, "loss": 0.0036, "reward": 7.345250129699707, "reward_std": 1.8361765146255493, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3702499270439148, "rewards/reward_search_strategy": 0.7250000238418579, "step": 315 }, { "completion_length": 269.625, "epoch": 1.104895104895105, "grad_norm": 1.358657717704773, "kl": 0.16482065618038177, "learning_rate": 4.322421568553529e-06, "loss": 0.0066, "reward": 5.7101945877075195, "reward_std": 2.8660097122192383, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3851943910121918, "rewards/reward_search_strategy": 0.574999988079071, "step": 316 }, { "completion_length": 426.0, "epoch": 1.1083916083916083, "grad_norm": 0.8824137449264526, "kl": 0.20691236853599548, "learning_rate": 4.316436677427441e-06, "loss": 0.0083, "reward": 5.294474124908447, "reward_std": 2.200099229812622, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5194741487503052, "rewards/reward_search_strategy": 0.6499999761581421, "step": 317 }, { "completion_length": 449.5, "epoch": 1.1118881118881119, "grad_norm": 0.6172366738319397, "kl": 0.07651077210903168, "learning_rate": 4.3104296535936695e-06, "loss": 0.0031, "reward": 4.2370381355285645, "reward_std": 2.9659953117370605, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4870380759239197, "rewards/reward_search_strategy": 0.5, "step": 318 }, { "completion_length": 535.75, "epoch": 1.1153846153846154, "grad_norm": 0.6779645085334778, "kl": 0.08398789167404175, "learning_rate": 4.3044005702459055e-06, "loss": 0.0034, "reward": 5.004274845123291, "reward_std": 2.3385448455810547, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.4542747437953949, "rewards/reward_search_strategy": 0.5500000715255737, "step": 319 }, { "completion_length": 401.25, "epoch": 1.118881118881119, "grad_norm": 2.2599902153015137, "kl": 0.16237607598304749, "learning_rate": 4.2983495008466285e-06, "loss": 0.0065, "reward": 5.627570629119873, "reward_std": 3.1766750812530518, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3525705933570862, "rewards/reward_search_strategy": 0.7749999761581421, "step": 320 }, { "completion_length": 345.0, "epoch": 1.1223776223776223, "grad_norm": 0.6874108910560608, "kl": 0.07024817913770676, "learning_rate": 4.2922765191262075e-06, "loss": 0.0028, "reward": 6.876392841339111, "reward_std": 2.1439974308013916, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5263932347297668, "rewards/reward_search_strategy": 0.8499999642372131, "step": 321 }, { "completion_length": 640.375, "epoch": 1.1258741258741258, "grad_norm": 0.5233185887336731, "kl": 0.05953974276781082, "learning_rate": 4.286181699082008e-06, "loss": 0.0024, "reward": 5.468829154968262, "reward_std": 2.4537880420684814, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.44382914900779724, "rewards/reward_search_strategy": 0.7749999761581421, "step": 322 }, { "completion_length": 425.5, "epoch": 1.1293706293706294, "grad_norm": 0.5920670628547668, "kl": 0.06298626214265823, "learning_rate": 4.280065114977492e-06, "loss": 0.0025, "reward": 2.663602590560913, "reward_std": 2.2890970706939697, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.5136026740074158, "rewards/reward_search_strategy": 0.6500000357627869, "step": 323 }, { "completion_length": 406.875, "epoch": 1.132867132867133, "grad_norm": 0.6604630947113037, "kl": 0.08841750025749207, "learning_rate": 4.273926841341303e-06, "loss": 0.0035, "reward": 5.190492153167725, "reward_std": 2.3081531524658203, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5654920339584351, "rewards/reward_search_strategy": 0.75, "step": 324 }, { "completion_length": 367.375, "epoch": 1.1363636363636362, "grad_norm": 6.282087802886963, "kl": 3.9899542331695557, "learning_rate": 4.267766952966369e-06, "loss": 0.1596, "reward": 3.679385185241699, "reward_std": 3.0263350009918213, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.45438525080680847, "rewards/reward_search_strategy": 0.4749999940395355, "step": 325 }, { "completion_length": 544.25, "epoch": 1.1398601398601398, "grad_norm": 1.237210988998413, "kl": 0.1231246143579483, "learning_rate": 4.261585524908987e-06, "loss": 0.0049, "reward": 3.501136064529419, "reward_std": 2.881526231765747, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4511359930038452, "rewards/reward_search_strategy": 0.675000011920929, "step": 326 }, { "completion_length": 564.0, "epoch": 1.1433566433566433, "grad_norm": 0.504810094833374, "kl": 0.09063772112131119, "learning_rate": 4.255382632487907e-06, "loss": 0.0036, "reward": 3.910247325897217, "reward_std": 1.80642831325531, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5602472424507141, "rewards/reward_search_strategy": 0.4750000238418579, "step": 327 }, { "completion_length": 374.5, "epoch": 1.1468531468531469, "grad_norm": 0.7342034578323364, "kl": 0.08345510065555573, "learning_rate": 4.249158351283414e-06, "loss": 0.0033, "reward": 5.706239700317383, "reward_std": 3.2305665016174316, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.53123939037323, "rewards/reward_search_strategy": 0.675000011920929, "step": 328 }, { "completion_length": 527.25, "epoch": 1.1503496503496504, "grad_norm": 0.46101129055023193, "kl": 0.07814895361661911, "learning_rate": 4.242912757136412e-06, "loss": 0.0031, "reward": 3.4161453247070312, "reward_std": 2.4508216381073, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5411453247070312, "rewards/reward_search_strategy": 0.5, "step": 329 }, { "completion_length": 363.875, "epoch": 1.1538461538461537, "grad_norm": 0.8629273176193237, "kl": 0.1023842915892601, "learning_rate": 4.236645926147493e-06, "loss": 0.0041, "reward": 5.935527324676514, "reward_std": 2.9570200443267822, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5355273485183716, "rewards/reward_search_strategy": 0.7749999761581421, "step": 330 }, { "completion_length": 523.25, "epoch": 1.1573426573426573, "grad_norm": 0.4914082884788513, "kl": 0.07481839507818222, "learning_rate": 4.230357934676017e-06, "loss": 0.003, "reward": 5.845967769622803, "reward_std": 2.551992177963257, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.495967835187912, "rewards/reward_search_strategy": 0.7249999642372131, "step": 331 }, { "completion_length": 382.75, "epoch": 1.1608391608391608, "grad_norm": 0.541487455368042, "kl": 0.11320384591817856, "learning_rate": 4.224048859339175e-06, "loss": 0.0045, "reward": 6.013635635375977, "reward_std": 2.582735300064087, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3386358618736267, "rewards/reward_search_strategy": 0.675000011920929, "step": 332 }, { "completion_length": 728.125, "epoch": 1.1643356643356644, "grad_norm": 0.42518407106399536, "kl": 0.047705646604299545, "learning_rate": 4.217718777011058e-06, "loss": 0.0019, "reward": 3.8004541397094727, "reward_std": 2.1054131984710693, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.35045403242111206, "rewards/reward_search_strategy": 0.699999988079071, "step": 333 }, { "completion_length": 310.375, "epoch": 1.167832167832168, "grad_norm": 1.1916086673736572, "kl": 0.15560080111026764, "learning_rate": 4.211367764821722e-06, "loss": 0.0062, "reward": 5.576718330383301, "reward_std": 3.0815911293029785, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.30171847343444824, "rewards/reward_search_strategy": 0.6500000357627869, "step": 334 }, { "completion_length": 221.25, "epoch": 1.1713286713286712, "grad_norm": 9.67128849029541, "kl": 0.7930659651756287, "learning_rate": 4.204995900156247e-06, "loss": 0.0317, "reward": 6.268731117248535, "reward_std": 3.307931423187256, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3937305808067322, "rewards/reward_search_strategy": 0.625, "step": 335 }, { "completion_length": 369.75, "epoch": 1.1748251748251748, "grad_norm": 1.1554738283157349, "kl": 0.13195408880710602, "learning_rate": 4.198603260653792e-06, "loss": 0.0053, "reward": 4.597299575805664, "reward_std": 2.33955979347229, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4972996115684509, "rewards/reward_search_strategy": 0.4750000238418579, "step": 336 }, { "completion_length": 411.25, "epoch": 1.1783216783216783, "grad_norm": 0.7392211556434631, "kl": 0.0733492374420166, "learning_rate": 4.192189924206652e-06, "loss": 0.0029, "reward": 4.321833610534668, "reward_std": 2.759119987487793, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.521833598613739, "rewards/reward_search_strategy": 0.800000011920929, "step": 337 }, { "completion_length": 451.875, "epoch": 1.1818181818181819, "grad_norm": 1.921710729598999, "kl": 0.21552717685699463, "learning_rate": 4.185755968959308e-06, "loss": 0.0086, "reward": 4.766526222229004, "reward_std": 2.58585524559021, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5915263891220093, "rewards/reward_search_strategy": 0.675000011920929, "step": 338 }, { "completion_length": 332.25, "epoch": 1.1853146853146854, "grad_norm": 1.0764071941375732, "kl": 0.11709711700677872, "learning_rate": 4.179301473307476e-06, "loss": 0.0047, "reward": 5.342533111572266, "reward_std": 3.2703137397766113, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5675327777862549, "rewards/reward_search_strategy": 0.7749999761581421, "step": 339 }, { "completion_length": 496.5, "epoch": 1.1888111888111887, "grad_norm": 0.7248229384422302, "kl": 0.13478195667266846, "learning_rate": 4.172826515897146e-06, "loss": 0.0054, "reward": 6.131617546081543, "reward_std": 2.5291075706481934, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4816179871559143, "rewards/reward_search_strategy": 0.6500000357627869, "step": 340 }, { "completion_length": 392.75, "epoch": 1.1923076923076923, "grad_norm": 0.6313228011131287, "kl": 0.08677749335765839, "learning_rate": 4.166331175623631e-06, "loss": 0.0035, "reward": 5.503582000732422, "reward_std": 2.0927414894104004, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5785820484161377, "rewards/reward_search_strategy": 0.7999999523162842, "step": 341 }, { "completion_length": 358.75, "epoch": 1.1958041958041958, "grad_norm": 0.5460852384567261, "kl": 0.13055555522441864, "learning_rate": 4.159815531630604e-06, "loss": 0.0052, "reward": 7.100890159606934, "reward_std": 2.1578822135925293, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.4008905589580536, "rewards/reward_search_strategy": 0.699999988079071, "step": 342 }, { "completion_length": 228.375, "epoch": 1.1993006993006994, "grad_norm": 0.8770285248756409, "kl": 0.1582285761833191, "learning_rate": 4.15327966330913e-06, "loss": 0.0063, "reward": 6.754122734069824, "reward_std": 3.0570099353790283, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5791229009628296, "rewards/reward_search_strategy": 0.675000011920929, "step": 343 }, { "completion_length": 397.75, "epoch": 1.2027972027972027, "grad_norm": 0.9353595972061157, "kl": 0.21055281162261963, "learning_rate": 4.146723650296701e-06, "loss": 0.0084, "reward": 5.746088981628418, "reward_std": 2.988830327987671, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4960886836051941, "rewards/reward_search_strategy": 0.75, "step": 344 }, { "completion_length": 376.5, "epoch": 1.2062937062937062, "grad_norm": 0.7754123210906982, "kl": 0.10437698662281036, "learning_rate": 4.140147572476269e-06, "loss": 0.0042, "reward": 3.666969060897827, "reward_std": 4.023746490478516, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.2669692635536194, "rewards/reward_search_strategy": 0.4000000059604645, "step": 345 }, { "completion_length": 435.625, "epoch": 1.2097902097902098, "grad_norm": 0.9024556875228882, "kl": 0.09636207669973373, "learning_rate": 4.133551509975264e-06, "loss": 0.0039, "reward": 4.0536298751831055, "reward_std": 3.3120667934417725, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5036299824714661, "rewards/reward_search_strategy": 0.550000011920929, "step": 346 }, { "completion_length": 234.375, "epoch": 1.2132867132867133, "grad_norm": 0.8678302764892578, "kl": 0.1372126191854477, "learning_rate": 4.126935543164628e-06, "loss": 0.0055, "reward": 5.475516319274902, "reward_std": 3.4662699699401855, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.45051607489585876, "rewards/reward_search_strategy": 0.6499999761581421, "step": 347 }, { "completion_length": 268.5, "epoch": 1.2167832167832167, "grad_norm": 1.028032660484314, "kl": 0.21305130422115326, "learning_rate": 4.120299752657828e-06, "loss": 0.0085, "reward": 5.076361656188965, "reward_std": 4.117757797241211, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4513613283634186, "rewards/reward_search_strategy": 0.625, "step": 348 }, { "completion_length": 510.375, "epoch": 1.2202797202797202, "grad_norm": 0.9544126391410828, "kl": 0.09230495244264603, "learning_rate": 4.113644219309877e-06, "loss": 0.0037, "reward": 6.58498477935791, "reward_std": 2.8294339179992676, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.6849843263626099, "rewards/reward_search_strategy": 0.6499999761581421, "step": 349 }, { "completion_length": 312.875, "epoch": 1.2237762237762237, "grad_norm": 2.1660711765289307, "kl": 0.22974209487438202, "learning_rate": 4.106969024216348e-06, "loss": 0.0092, "reward": 6.715004920959473, "reward_std": 2.784714937210083, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.49000489711761475, "rewards/reward_search_strategy": 0.7250000238418579, "step": 350 }, { "completion_length": 348.25, "epoch": 1.2272727272727273, "grad_norm": 0.9564335346221924, "kl": 0.20949901640415192, "learning_rate": 4.1002742487123896e-06, "loss": 0.0084, "reward": 4.565638542175293, "reward_std": 2.672559976577759, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.39063847064971924, "rewards/reward_search_strategy": 0.675000011920929, "step": 351 }, { "completion_length": 479.5, "epoch": 1.2307692307692308, "grad_norm": 0.6322876214981079, "kl": 0.07695091515779495, "learning_rate": 4.093559974371725e-06, "loss": 0.0031, "reward": 6.6676716804504395, "reward_std": 2.6538872718811035, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5926718711853027, "rewards/reward_search_strategy": 0.8250000476837158, "step": 352 }, { "completion_length": 494.75, "epoch": 1.2342657342657342, "grad_norm": 0.4909517168998718, "kl": 0.07885551452636719, "learning_rate": 4.086826283005669e-06, "loss": 0.0032, "reward": 6.1619181632995605, "reward_std": 2.8411340713500977, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3369182050228119, "rewards/reward_search_strategy": 0.699999988079071, "step": 353 }, { "completion_length": 260.25, "epoch": 1.2377622377622377, "grad_norm": 1.253501057624817, "kl": 0.20247994363307953, "learning_rate": 4.080073256662128e-06, "loss": 0.0081, "reward": 5.574455261230469, "reward_std": 3.5395596027374268, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3494548499584198, "rewards/reward_search_strategy": 0.6000000238418579, "step": 354 }, { "completion_length": 309.125, "epoch": 1.2412587412587412, "grad_norm": 1.7126333713531494, "kl": 0.21196775138378143, "learning_rate": 4.073300977624594e-06, "loss": 0.0085, "reward": 7.307598114013672, "reward_std": 2.9741148948669434, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5575979948043823, "rewards/reward_search_strategy": 0.625, "step": 355 }, { "completion_length": 405.0, "epoch": 1.2447552447552448, "grad_norm": 1.082837462425232, "kl": 0.07227756828069687, "learning_rate": 4.066509528411151e-06, "loss": 0.0029, "reward": 5.161370277404785, "reward_std": 3.016519546508789, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.43637019395828247, "rewards/reward_search_strategy": 0.7249999642372131, "step": 356 }, { "completion_length": 376.25, "epoch": 1.2482517482517483, "grad_norm": 1.099038004875183, "kl": 0.13722370564937592, "learning_rate": 4.059698991773466e-06, "loss": 0.0055, "reward": 3.3820579051971436, "reward_std": 2.766157388687134, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.40705791115760803, "rewards/reward_search_strategy": 0.4749999940395355, "step": 357 }, { "completion_length": 532.625, "epoch": 1.2517482517482517, "grad_norm": 0.6121649146080017, "kl": 0.05823325365781784, "learning_rate": 4.052869450695776e-06, "loss": 0.0023, "reward": 4.155327796936035, "reward_std": 3.0830624103546143, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.530327558517456, "rewards/reward_search_strategy": 0.5, "step": 358 }, { "completion_length": 534.875, "epoch": 1.2552447552447552, "grad_norm": 0.5585082173347473, "kl": 0.07693375647068024, "learning_rate": 4.046020988393886e-06, "loss": 0.0031, "reward": 4.022989273071289, "reward_std": 1.8042207956314087, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3729895353317261, "rewards/reward_search_strategy": 0.6500000357627869, "step": 359 }, { "completion_length": 404.125, "epoch": 1.2587412587412588, "grad_norm": 0.5998912453651428, "kl": 0.08919445425271988, "learning_rate": 4.039153688314146e-06, "loss": 0.0036, "reward": 6.726578712463379, "reward_std": 2.5917811393737793, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.35157859325408936, "rewards/reward_search_strategy": 0.875, "step": 360 }, { "completion_length": 406.5, "epoch": 1.2622377622377623, "grad_norm": 1.4461448192596436, "kl": 0.12945714592933655, "learning_rate": 4.032267634132442e-06, "loss": 0.0052, "reward": 6.075223922729492, "reward_std": 3.8267555236816406, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.47522372007369995, "rewards/reward_search_strategy": 0.6000000238418579, "step": 361 }, { "completion_length": 490.25, "epoch": 1.2657342657342658, "grad_norm": 0.5157952308654785, "kl": 0.08816548436880112, "learning_rate": 4.02536290975317e-06, "loss": 0.0035, "reward": 7.19053316116333, "reward_std": 2.843262195587158, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.640533447265625, "rewards/reward_search_strategy": 0.800000011920929, "step": 362 }, { "completion_length": 320.375, "epoch": 1.2692307692307692, "grad_norm": 11.697229385375977, "kl": 0.3807832598686218, "learning_rate": 4.018439599308217e-06, "loss": 0.0152, "reward": 6.845804214477539, "reward_std": 3.1374518871307373, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.42080387473106384, "rewards/reward_search_strategy": 0.800000011920929, "step": 363 }, { "completion_length": 307.75, "epoch": 1.2727272727272727, "grad_norm": 0.8195920586585999, "kl": 0.2704997956752777, "learning_rate": 4.011497787155938e-06, "loss": 0.0108, "reward": 5.673530101776123, "reward_std": 2.617912769317627, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.34852996468544006, "rewards/reward_search_strategy": 0.44999998807907104, "step": 364 }, { "completion_length": 284.0, "epoch": 1.2762237762237763, "grad_norm": 1.018688678741455, "kl": 0.14098995923995972, "learning_rate": 4.0045375578801216e-06, "loss": 0.0056, "reward": 5.075797080993652, "reward_std": 3.588682174682617, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3757967948913574, "rewards/reward_search_strategy": 0.44999998807907104, "step": 365 }, { "completion_length": 364.875, "epoch": 1.2797202797202798, "grad_norm": 0.9165819883346558, "kl": 0.14407169818878174, "learning_rate": 3.997558996288965e-06, "loss": 0.0058, "reward": 5.143270492553711, "reward_std": 3.304353713989258, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5432703495025635, "rewards/reward_search_strategy": 0.4750000238418579, "step": 366 }, { "completion_length": 225.375, "epoch": 1.2832167832167833, "grad_norm": 1.225709319114685, "kl": 0.20389658212661743, "learning_rate": 3.9905621874140396e-06, "loss": 0.0082, "reward": 5.480903625488281, "reward_std": 3.1632080078125, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.455904096364975, "rewards/reward_search_strategy": 0.40000003576278687, "step": 367 }, { "completion_length": 338.875, "epoch": 1.2867132867132867, "grad_norm": 1.0578041076660156, "kl": 0.16953974962234497, "learning_rate": 3.983547216509254e-06, "loss": 0.0068, "reward": 5.733012676239014, "reward_std": 3.0423076152801514, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3580129146575928, "rewards/reward_search_strategy": 0.625, "step": 368 }, { "completion_length": 402.375, "epoch": 1.2902097902097902, "grad_norm": 1.284543752670288, "kl": 0.14602556824684143, "learning_rate": 3.976514169049814e-06, "loss": 0.0058, "reward": 6.073895454406738, "reward_std": 3.7657344341278076, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.42389529943466187, "rewards/reward_search_strategy": 0.6499999761581421, "step": 369 }, { "completion_length": 418.75, "epoch": 1.2937062937062938, "grad_norm": 1.140341877937317, "kl": 0.24927854537963867, "learning_rate": 3.969463130731183e-06, "loss": 0.01, "reward": 3.9747872352600098, "reward_std": 2.9689948558807373, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5747873783111572, "rewards/reward_search_strategy": 0.7749999761581421, "step": 370 }, { "completion_length": 278.25, "epoch": 1.297202797202797, "grad_norm": 1.2817490100860596, "kl": 0.3547288179397583, "learning_rate": 3.96239418746804e-06, "loss": 0.0142, "reward": 4.307415008544922, "reward_std": 2.03360915184021, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.4324150085449219, "rewards/reward_search_strategy": 0.625, "step": 371 }, { "completion_length": 383.5, "epoch": 1.3006993006993006, "grad_norm": 0.7459798455238342, "kl": 0.17955949902534485, "learning_rate": 3.955307425393224e-06, "loss": 0.0072, "reward": 5.501554489135742, "reward_std": 3.2705891132354736, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5265547633171082, "rewards/reward_search_strategy": 0.7249999642372131, "step": 372 }, { "completion_length": 309.125, "epoch": 1.3041958041958042, "grad_norm": 1.1568890810012817, "kl": 0.16535663604736328, "learning_rate": 3.948202930856697e-06, "loss": 0.0066, "reward": 6.784092903137207, "reward_std": 3.0337963104248047, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.48409315943717957, "rewards/reward_search_strategy": 0.7999999523162842, "step": 373 }, { "completion_length": 295.375, "epoch": 1.3076923076923077, "grad_norm": 0.7384740710258484, "kl": 0.215895876288414, "learning_rate": 3.941080790424483e-06, "loss": 0.0086, "reward": 7.188152313232422, "reward_std": 2.466623067855835, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.6631525754928589, "rewards/reward_search_strategy": 0.7749999761581421, "step": 374 }, { "completion_length": 472.25, "epoch": 1.3111888111888113, "grad_norm": 0.5587518215179443, "kl": 0.11894085258245468, "learning_rate": 3.933941090877615e-06, "loss": 0.0048, "reward": 5.4161481857299805, "reward_std": 2.650923728942871, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5661482810974121, "rewards/reward_search_strategy": 0.8499999642372131, "step": 375 }, { "completion_length": 246.75, "epoch": 1.3146853146853146, "grad_norm": 0.9946545362472534, "kl": 0.1861283779144287, "learning_rate": 3.92678391921108e-06, "loss": 0.0074, "reward": 6.748879432678223, "reward_std": 3.111686944961548, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3488798141479492, "rewards/reward_search_strategy": 0.7749999761581421, "step": 376 }, { "completion_length": 281.75, "epoch": 1.3181818181818181, "grad_norm": 11.089983940124512, "kl": 4.762326240539551, "learning_rate": 3.9196093626327535e-06, "loss": 0.1905, "reward": 4.622366905212402, "reward_std": 2.756638288497925, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5723665952682495, "rewards/reward_search_strategy": 0.800000011920929, "step": 377 }, { "completion_length": 346.75, "epoch": 1.3216783216783217, "grad_norm": 0.6730005741119385, "kl": 0.1391105055809021, "learning_rate": 3.912417508562345e-06, "loss": 0.0056, "reward": 4.503476142883301, "reward_std": 2.2465193271636963, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.6034764647483826, "rewards/reward_search_strategy": 0.6500000357627869, "step": 378 }, { "completion_length": 473.125, "epoch": 1.3251748251748252, "grad_norm": 0.8451654314994812, "kl": 0.12520407140254974, "learning_rate": 3.905208444630326e-06, "loss": 0.005, "reward": 4.186497688293457, "reward_std": 3.0242393016815186, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5614977478981018, "rewards/reward_search_strategy": 0.5, "step": 379 }, { "completion_length": 246.25, "epoch": 1.3286713286713288, "grad_norm": 0.9249701499938965, "kl": 0.2778664827346802, "learning_rate": 3.897982258676867e-06, "loss": 0.0111, "reward": 5.40871000289917, "reward_std": 3.004138231277466, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5337100028991699, "rewards/reward_search_strategy": 0.625, "step": 380 }, { "completion_length": 502.75, "epoch": 1.332167832167832, "grad_norm": 0.46744218468666077, "kl": 0.09883429110050201, "learning_rate": 3.890739038750763e-06, "loss": 0.004, "reward": 4.416123390197754, "reward_std": 1.8906164169311523, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.6411235332489014, "rewards/reward_search_strategy": 0.7749999761581421, "step": 381 }, { "completion_length": 249.625, "epoch": 1.3356643356643356, "grad_norm": 0.8567023873329163, "kl": 0.16580483317375183, "learning_rate": 3.88347887310836e-06, "loss": 0.0066, "reward": 7.747579097747803, "reward_std": 1.4630359411239624, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5475790500640869, "rewards/reward_search_strategy": 0.824999988079071, "step": 382 }, { "completion_length": 283.5, "epoch": 1.3391608391608392, "grad_norm": 1.3123666048049927, "kl": 0.2037430703639984, "learning_rate": 3.876201850212489e-06, "loss": 0.0081, "reward": 6.500353813171387, "reward_std": 3.7146830558776855, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.40035414695739746, "rewards/reward_search_strategy": 0.7250000238418579, "step": 383 }, { "completion_length": 519.625, "epoch": 1.3426573426573427, "grad_norm": 1.0492886304855347, "kl": 0.09502134472131729, "learning_rate": 3.868908058731376e-06, "loss": 0.0038, "reward": 4.339066505432129, "reward_std": 2.7583582401275635, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3890661597251892, "rewards/reward_search_strategy": 0.574999988079071, "step": 384 }, { "completion_length": 439.5, "epoch": 1.3461538461538463, "grad_norm": 0.824394166469574, "kl": 0.09405747056007385, "learning_rate": 3.861597587537568e-06, "loss": 0.0038, "reward": 3.9054057598114014, "reward_std": 2.7032172679901123, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.2804058790206909, "rewards/reward_search_strategy": 0.625, "step": 385 }, { "completion_length": 357.25, "epoch": 1.3496503496503496, "grad_norm": 0.8456993103027344, "kl": 0.1805035024881363, "learning_rate": 3.85427052570685e-06, "loss": 0.0072, "reward": 6.4848246574401855, "reward_std": 2.7532103061676025, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5098246932029724, "rewards/reward_search_strategy": 0.7249999642372131, "step": 386 }, { "completion_length": 282.25, "epoch": 1.3531468531468531, "grad_norm": 1.6480878591537476, "kl": 0.17681656777858734, "learning_rate": 3.846926962517158e-06, "loss": 0.0071, "reward": 6.323261737823486, "reward_std": 2.6390492916107178, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.4482613801956177, "rewards/reward_search_strategy": 0.6250000596046448, "step": 387 }, { "completion_length": 237.125, "epoch": 1.3566433566433567, "grad_norm": 3.1040449142456055, "kl": 0.48438748717308044, "learning_rate": 3.839566987447492e-06, "loss": 0.0194, "reward": 6.36116361618042, "reward_std": 2.4046599864959717, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3111635446548462, "rewards/reward_search_strategy": 0.30000001192092896, "step": 388 }, { "completion_length": 350.5, "epoch": 1.3601398601398602, "grad_norm": 1.3969569206237793, "kl": 0.14033286273479462, "learning_rate": 3.832190690176825e-06, "loss": 0.0056, "reward": 2.188044786453247, "reward_std": 1.4462171792984009, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.16304486989974976, "rewards/reward_search_strategy": 0.4000000059604645, "step": 389 }, { "completion_length": 326.125, "epoch": 1.3636363636363638, "grad_norm": 1.3200170993804932, "kl": 0.15739156305789948, "learning_rate": 3.824798160583012e-06, "loss": 0.0063, "reward": 4.655465126037598, "reward_std": 2.911059856414795, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4304652214050293, "rewards/reward_search_strategy": 0.6000000238418579, "step": 390 }, { "completion_length": 618.625, "epoch": 1.367132867132867, "grad_norm": 0.6044898629188538, "kl": 0.10862935334444046, "learning_rate": 3.817389488741694e-06, "loss": 0.0043, "reward": 5.85195255279541, "reward_std": 2.695142984390259, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5269524455070496, "rewards/reward_search_strategy": 0.824999988079071, "step": 391 }, { "completion_length": 388.125, "epoch": 1.3706293706293706, "grad_norm": 0.757124662399292, "kl": 0.11755659431219101, "learning_rate": 3.8099647649251984e-06, "loss": 0.0047, "reward": 4.8429670333862305, "reward_std": 2.5176022052764893, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.39296701550483704, "rewards/reward_search_strategy": 0.8250000476837158, "step": 392 }, { "completion_length": 383.0, "epoch": 1.3741258741258742, "grad_norm": 0.6554877758026123, "kl": 0.15485447645187378, "learning_rate": 3.802524079601442e-06, "loss": 0.0062, "reward": 3.9751625061035156, "reward_std": 2.614086151123047, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4751623272895813, "rewards/reward_search_strategy": 0.625, "step": 393 }, { "completion_length": 368.125, "epoch": 1.3776223776223775, "grad_norm": 0.6346847414970398, "kl": 0.11959197372198105, "learning_rate": 3.795067523432826e-06, "loss": 0.0048, "reward": 4.5723676681518555, "reward_std": 2.505882978439331, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.49736788868904114, "rewards/reward_search_strategy": 0.7000000476837158, "step": 394 }, { "completion_length": 249.875, "epoch": 1.381118881118881, "grad_norm": 0.7225646376609802, "kl": 0.14031687378883362, "learning_rate": 3.787595187275136e-06, "loss": 0.0056, "reward": 7.360940933227539, "reward_std": 1.6116032600402832, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.23594066500663757, "rewards/reward_search_strategy": 0.625, "step": 395 }, { "completion_length": 401.875, "epoch": 1.3846153846153846, "grad_norm": 0.7882583141326904, "kl": 0.1797572374343872, "learning_rate": 3.780107162176429e-06, "loss": 0.0072, "reward": 3.243696689605713, "reward_std": 2.612200975418091, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.5436968803405762, "rewards/reward_search_strategy": 0.574999988079071, "step": 396 }, { "completion_length": 294.25, "epoch": 1.3881118881118881, "grad_norm": 3.533095598220825, "kl": 1.1287881135940552, "learning_rate": 3.772603539375929e-06, "loss": 0.0452, "reward": 2.366114616394043, "reward_std": 1.650844693183899, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.4661146402359009, "rewards/reward_search_strategy": 0.40000003576278687, "step": 397 }, { "completion_length": 413.75, "epoch": 1.3916083916083917, "grad_norm": 0.672588586807251, "kl": 0.12809911370277405, "learning_rate": 3.7650844103029093e-06, "loss": 0.0051, "reward": 4.5074543952941895, "reward_std": 1.824966549873352, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.6074544191360474, "rewards/reward_search_strategy": 0.6499999761581421, "step": 398 }, { "completion_length": 327.375, "epoch": 1.395104895104895, "grad_norm": 1.236580729484558, "kl": 0.19380441308021545, "learning_rate": 3.7575498665755884e-06, "loss": 0.0078, "reward": 5.028186321258545, "reward_std": 3.838682174682617, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.45318639278411865, "rewards/reward_search_strategy": 0.699999988079071, "step": 399 }, { "completion_length": 342.375, "epoch": 1.3986013986013985, "grad_norm": 0.9333245754241943, "kl": 0.20195768773555756, "learning_rate": 3.7500000000000005e-06, "loss": 0.0081, "reward": 3.8275704383850098, "reward_std": 3.027420997619629, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.502570390701294, "rewards/reward_search_strategy": 0.45000001788139343, "step": 400 }, { "completion_length": 289.625, "epoch": 1.402097902097902, "grad_norm": 0.7431657314300537, "kl": 0.1940464824438095, "learning_rate": 3.742434902568889e-06, "loss": 0.0078, "reward": 6.76303768157959, "reward_std": 1.9291017055511475, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.6130377054214478, "rewards/reward_search_strategy": 0.6499999761581421, "step": 401 }, { "completion_length": 236.625, "epoch": 1.4055944055944056, "grad_norm": 1.0157201290130615, "kl": 0.28472885489463806, "learning_rate": 3.7348546664605777e-06, "loss": 0.0114, "reward": 4.573916435241699, "reward_std": 3.343165636062622, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3239161968231201, "rewards/reward_search_strategy": 0.375, "step": 402 }, { "completion_length": 370.75, "epoch": 1.4090909090909092, "grad_norm": 0.6442353129386902, "kl": 0.13476325571537018, "learning_rate": 3.7272593840378526e-06, "loss": 0.0054, "reward": 5.87241268157959, "reward_std": 2.7663140296936035, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.32241225242614746, "rewards/reward_search_strategy": 0.800000011920929, "step": 403 }, { "completion_length": 344.875, "epoch": 1.4125874125874125, "grad_norm": 0.9788044691085815, "kl": 0.16146501898765564, "learning_rate": 3.7196491478468322e-06, "loss": 0.0065, "reward": 6.843417167663574, "reward_std": 2.5316994190216064, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.44341692328453064, "rewards/reward_search_strategy": 0.7749999761581421, "step": 404 }, { "completion_length": 300.125, "epoch": 1.416083916083916, "grad_norm": 0.9080623984336853, "kl": 0.20929095149040222, "learning_rate": 3.7120240506158433e-06, "loss": 0.0084, "reward": 6.924595832824707, "reward_std": 2.5679879188537598, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.29959601163864136, "rewards/reward_search_strategy": 0.625, "step": 405 }, { "completion_length": 347.875, "epoch": 1.4195804195804196, "grad_norm": 0.9922822713851929, "kl": 0.16099223494529724, "learning_rate": 3.7043841852542884e-06, "loss": 0.0064, "reward": 5.190982818603516, "reward_std": 3.008113145828247, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.41598278284072876, "rewards/reward_search_strategy": 0.5249999761581421, "step": 406 }, { "completion_length": 414.25, "epoch": 1.4230769230769231, "grad_norm": 0.8010644912719727, "kl": 0.12256429344415665, "learning_rate": 3.6967296448515176e-06, "loss": 0.0049, "reward": 5.230372428894043, "reward_std": 3.308673143386841, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5303722620010376, "rewards/reward_search_strategy": 0.5750000476837158, "step": 407 }, { "completion_length": 248.0, "epoch": 1.4265734265734267, "grad_norm": 1.6469920873641968, "kl": 0.2045002579689026, "learning_rate": 3.689060522675689e-06, "loss": 0.0082, "reward": 6.9636945724487305, "reward_std": 2.935652017593384, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.36369478702545166, "rewards/reward_search_strategy": 0.7250000238418579, "step": 408 }, { "completion_length": 426.5, "epoch": 1.43006993006993, "grad_norm": 0.7976499199867249, "kl": 0.1475878804922104, "learning_rate": 3.6813769121726356e-06, "loss": 0.0059, "reward": 3.9527182579040527, "reward_std": 2.7393836975097656, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.40271830558776855, "rewards/reward_search_strategy": 0.42500001192092896, "step": 409 }, { "completion_length": 384.125, "epoch": 1.4335664335664335, "grad_norm": 1.68340265750885, "kl": 0.2281363308429718, "learning_rate": 3.6736789069647273e-06, "loss": 0.0091, "reward": 4.657155990600586, "reward_std": 3.039395332336426, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.48215600848197937, "rewards/reward_search_strategy": 0.550000011920929, "step": 410 }, { "completion_length": 394.125, "epoch": 1.437062937062937, "grad_norm": 0.4666031002998352, "kl": 0.1385246068239212, "learning_rate": 3.6659666008497287e-06, "loss": 0.0055, "reward": 3.309140205383301, "reward_std": 1.4426900148391724, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.409140020608902, "rewards/reward_search_strategy": 0.6499999761581421, "step": 411 }, { "completion_length": 232.375, "epoch": 1.4405594405594406, "grad_norm": 1.0850403308868408, "kl": 0.23393404483795166, "learning_rate": 3.658240087799655e-06, "loss": 0.0094, "reward": 5.597710609436035, "reward_std": 2.9239015579223633, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3727110028266907, "rewards/reward_search_strategy": 0.5999999642372131, "step": 412 }, { "completion_length": 329.875, "epoch": 1.4440559440559442, "grad_norm": 1.1676043272018433, "kl": 0.21017572283744812, "learning_rate": 3.6504994619596295e-06, "loss": 0.0084, "reward": 7.767879486083984, "reward_std": 1.3231239318847656, "rewards/reward_correctness": 1.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.49287882447242737, "rewards/reward_search_strategy": 0.6499999761581421, "step": 413 }, { "completion_length": 286.875, "epoch": 1.4475524475524475, "grad_norm": 1.0990068912506104, "kl": 0.21051634848117828, "learning_rate": 3.642744817646736e-06, "loss": 0.0084, "reward": 4.044559001922607, "reward_std": 3.1820321083068848, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.294559121131897, "rewards/reward_search_strategy": 0.375, "step": 414 }, { "completion_length": 320.75, "epoch": 1.451048951048951, "grad_norm": 1.5767216682434082, "kl": 0.18334715068340302, "learning_rate": 3.634976249348867e-06, "loss": 0.0073, "reward": 5.833085536956787, "reward_std": 3.1178810596466064, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.48308563232421875, "rewards/reward_search_strategy": 0.7249999642372131, "step": 415 }, { "completion_length": 369.125, "epoch": 1.4545454545454546, "grad_norm": 1.140123963356018, "kl": 0.19512750208377838, "learning_rate": 3.627193851723577e-06, "loss": 0.0078, "reward": 2.1661739349365234, "reward_std": 1.7770532369613647, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.44117406010627747, "rewards/reward_search_strategy": 0.3500000238418579, "step": 416 }, { "completion_length": 224.75, "epoch": 1.458041958041958, "grad_norm": 0.892484724521637, "kl": 0.2562803626060486, "learning_rate": 3.6193977195969243e-06, "loss": 0.0103, "reward": 3.8352110385894775, "reward_std": 3.6014862060546875, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.2602110207080841, "rewards/reward_search_strategy": 0.32500001788139343, "step": 417 }, { "completion_length": 275.0, "epoch": 1.4615384615384617, "grad_norm": 1.5628401041030884, "kl": 0.2238333523273468, "learning_rate": 3.611587947962319e-06, "loss": 0.009, "reward": 5.637851715087891, "reward_std": 3.907928466796875, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.537851870059967, "rewards/reward_search_strategy": 0.4749999940395355, "step": 418 }, { "completion_length": 542.375, "epoch": 1.465034965034965, "grad_norm": 1.147643804550171, "kl": 0.17975068092346191, "learning_rate": 3.6037646319793635e-06, "loss": 0.0072, "reward": 2.3722047805786133, "reward_std": 1.6781915426254272, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4472048282623291, "rewards/reward_search_strategy": 0.550000011920929, "step": 419 }, { "completion_length": 384.125, "epoch": 1.4685314685314685, "grad_norm": 1.0817137956619263, "kl": 0.20022520422935486, "learning_rate": 3.595927866972694e-06, "loss": 0.008, "reward": 3.1727235317230225, "reward_std": 2.7517311573028564, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5477235317230225, "rewards/reward_search_strategy": 0.5, "step": 420 }, { "completion_length": 336.875, "epoch": 1.472027972027972, "grad_norm": 0.7116675972938538, "kl": 0.1551167070865631, "learning_rate": 3.5880777484308193e-06, "loss": 0.0062, "reward": 5.459067344665527, "reward_std": 2.214207649230957, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.2840671241283417, "rewards/reward_search_strategy": 0.925000011920929, "step": 421 }, { "completion_length": 598.25, "epoch": 1.4755244755244754, "grad_norm": 0.4839644432067871, "kl": 0.2022503912448883, "learning_rate": 3.5802143720049565e-06, "loss": 0.0081, "reward": 7.548956394195557, "reward_std": 2.6123108863830566, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5989563465118408, "rewards/reward_search_strategy": 0.824999988079071, "step": 422 }, { "completion_length": 448.75, "epoch": 1.479020979020979, "grad_norm": 0.6794417500495911, "kl": 0.18201248347759247, "learning_rate": 3.5723378335078653e-06, "loss": 0.0073, "reward": 6.285902500152588, "reward_std": 2.544133424758911, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5859024524688721, "rewards/reward_search_strategy": 0.824999988079071, "step": 423 }, { "completion_length": 424.125, "epoch": 1.4825174825174825, "grad_norm": 0.7091171741485596, "kl": 0.14801469445228577, "learning_rate": 3.564448228912682e-06, "loss": 0.0059, "reward": 3.324188232421875, "reward_std": 2.133406400680542, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6241883039474487, "rewards/reward_search_strategy": 0.574999988079071, "step": 424 }, { "completion_length": 335.625, "epoch": 1.486013986013986, "grad_norm": 1.1051809787750244, "kl": 0.1744394153356552, "learning_rate": 3.556545654351749e-06, "loss": 0.007, "reward": 3.0275347232818604, "reward_std": 2.2897677421569824, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5275347232818604, "rewards/reward_search_strategy": 0.625, "step": 425 }, { "completion_length": 313.5, "epoch": 1.4895104895104896, "grad_norm": 1.1032415628433228, "kl": 0.29376858472824097, "learning_rate": 3.5486302061154433e-06, "loss": 0.0118, "reward": 4.647106170654297, "reward_std": 3.4931628704071045, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4721059799194336, "rewards/reward_search_strategy": 0.675000011920929, "step": 426 }, { "completion_length": 305.625, "epoch": 1.493006993006993, "grad_norm": 0.8230642080307007, "kl": 0.16636496782302856, "learning_rate": 3.5407019806510035e-06, "loss": 0.0067, "reward": 6.975151062011719, "reward_std": 1.4824897050857544, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.525151252746582, "rewards/reward_search_strategy": 0.824999988079071, "step": 427 }, { "completion_length": 428.5, "epoch": 1.4965034965034965, "grad_norm": 0.6625391244888306, "kl": 0.11103808134794235, "learning_rate": 3.532761074561355e-06, "loss": 0.0044, "reward": 2.761242389678955, "reward_std": 1.572056770324707, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.43624237179756165, "rewards/reward_search_strategy": 0.44999998807907104, "step": 428 }, { "completion_length": 370.25, "epoch": 1.5, "grad_norm": 1.137886643409729, "kl": 0.2061176896095276, "learning_rate": 3.524807584603932e-06, "loss": 0.0082, "reward": 5.234549522399902, "reward_std": 3.020899534225464, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.434549480676651, "rewards/reward_search_strategy": 0.675000011920929, "step": 429 }, { "completion_length": 460.875, "epoch": 1.5034965034965035, "grad_norm": 0.6655659675598145, "kl": 0.13574932515621185, "learning_rate": 3.516841607689501e-06, "loss": 0.0054, "reward": 4.962242126464844, "reward_std": 2.2505006790161133, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.6372420787811279, "rewards/reward_search_strategy": 0.574999988079071, "step": 430 }, { "completion_length": 449.75, "epoch": 1.506993006993007, "grad_norm": 1.6040316820144653, "kl": 0.2962697148323059, "learning_rate": 3.5088632408809757e-06, "loss": 0.0119, "reward": 2.212186574935913, "reward_std": 1.8170539140701294, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.312186598777771, "rewards/reward_search_strategy": 0.4000000059604645, "step": 431 }, { "completion_length": 239.75, "epoch": 1.5104895104895104, "grad_norm": 2.0863418579101562, "kl": 0.42183053493499756, "learning_rate": 3.5008725813922383e-06, "loss": 0.0169, "reward": 5.592342376708984, "reward_std": 3.7416441440582275, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.292342871427536, "rewards/reward_search_strategy": 0.550000011920929, "step": 432 }, { "completion_length": 215.875, "epoch": 1.513986013986014, "grad_norm": 1.3532419204711914, "kl": 0.34601929783821106, "learning_rate": 3.4928697265869516e-06, "loss": 0.0138, "reward": 2.0051121711730957, "reward_std": 1.8395854234695435, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.30511218309402466, "rewards/reward_search_strategy": 0.32499998807907104, "step": 433 }, { "completion_length": 289.625, "epoch": 1.5174825174825175, "grad_norm": 2.330354928970337, "kl": 0.4995352625846863, "learning_rate": 3.4848547739773782e-06, "loss": 0.02, "reward": 2.3652281761169434, "reward_std": 1.205407977104187, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4152282774448395, "rewards/reward_search_strategy": 0.44999998807907104, "step": 434 }, { "completion_length": 480.0, "epoch": 1.5209790209790208, "grad_norm": 0.8131763935089111, "kl": 0.21656283736228943, "learning_rate": 3.476827821223184e-06, "loss": 0.0087, "reward": 3.1841320991516113, "reward_std": 2.568476438522339, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5091319680213928, "rewards/reward_search_strategy": 0.550000011920929, "step": 435 }, { "completion_length": 393.25, "epoch": 1.5244755244755246, "grad_norm": 0.9795050621032715, "kl": 0.23742111027240753, "learning_rate": 3.4687889661302577e-06, "loss": 0.0095, "reward": 5.273265361785889, "reward_std": 3.0285797119140625, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5982652306556702, "rewards/reward_search_strategy": 0.675000011920929, "step": 436 }, { "completion_length": 477.375, "epoch": 1.527972027972028, "grad_norm": 1.0040581226348877, "kl": 0.1645866483449936, "learning_rate": 3.460738306649509e-06, "loss": 0.0066, "reward": 3.343968152999878, "reward_std": 1.9579272270202637, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.26896828413009644, "rewards/reward_search_strategy": 0.699999988079071, "step": 437 }, { "completion_length": 418.375, "epoch": 1.5314685314685315, "grad_norm": 0.6381561160087585, "kl": 0.1781652718782425, "learning_rate": 3.452675940875686e-06, "loss": 0.0071, "reward": 4.517590522766113, "reward_std": 1.709311842918396, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.6175904273986816, "rewards/reward_search_strategy": 0.5249999761581421, "step": 438 }, { "completion_length": 324.625, "epoch": 1.534965034965035, "grad_norm": 1.2970616817474365, "kl": 0.2414904087781906, "learning_rate": 3.4446019670461684e-06, "loss": 0.0097, "reward": 3.321227788925171, "reward_std": 1.2823264598846436, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.47122782468795776, "rewards/reward_search_strategy": 0.4750000238418579, "step": 439 }, { "completion_length": 398.25, "epoch": 1.5384615384615383, "grad_norm": 1.4675123691558838, "kl": 0.30049189925193787, "learning_rate": 3.436516483539781e-06, "loss": 0.012, "reward": 3.52902889251709, "reward_std": 2.5919923782348633, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.47902899980545044, "rewards/reward_search_strategy": 0.550000011920929, "step": 440 }, { "completion_length": 424.375, "epoch": 1.541958041958042, "grad_norm": 2.437140941619873, "kl": 0.4226469099521637, "learning_rate": 3.4284195888755877e-06, "loss": 0.0169, "reward": 4.794424057006836, "reward_std": 3.2916970252990723, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5194237232208252, "rewards/reward_search_strategy": 0.6499999761581421, "step": 441 }, { "completion_length": 520.125, "epoch": 1.5454545454545454, "grad_norm": 0.6724218130111694, "kl": 0.23738737404346466, "learning_rate": 3.4203113817116955e-06, "loss": 0.0095, "reward": 6.539656639099121, "reward_std": 2.3404319286346436, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.4896567165851593, "rewards/reward_search_strategy": 0.550000011920929, "step": 442 }, { "completion_length": 475.625, "epoch": 1.548951048951049, "grad_norm": 0.793103039264679, "kl": 0.2378067821264267, "learning_rate": 3.412191960844049e-06, "loss": 0.0095, "reward": 3.998465061187744, "reward_std": 2.821582794189453, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5234651565551758, "rewards/reward_search_strategy": 0.6000000238418579, "step": 443 }, { "completion_length": 488.25, "epoch": 1.5524475524475525, "grad_norm": 0.6211789846420288, "kl": 0.16191114485263824, "learning_rate": 3.4040614252052305e-06, "loss": 0.0065, "reward": 4.963681221008301, "reward_std": 2.8725788593292236, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5636811256408691, "rewards/reward_search_strategy": 0.6499999761581421, "step": 444 }, { "completion_length": 266.25, "epoch": 1.5559440559440558, "grad_norm": 1.6396141052246094, "kl": 0.42981529235839844, "learning_rate": 3.39591987386325e-06, "loss": 0.0172, "reward": 3.585543155670166, "reward_std": 2.793571710586548, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5105429887771606, "rewards/reward_search_strategy": 0.5750000476837158, "step": 445 }, { "completion_length": 511.125, "epoch": 1.5594405594405596, "grad_norm": 0.6755155920982361, "kl": 0.1948806196451187, "learning_rate": 3.387767406020343e-06, "loss": 0.0078, "reward": 6.462329864501953, "reward_std": 2.490957021713257, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3623296320438385, "rewards/reward_search_strategy": 0.6000000238418579, "step": 446 }, { "completion_length": 309.0, "epoch": 1.562937062937063, "grad_norm": 0.9001058340072632, "kl": 0.23154594004154205, "learning_rate": 3.3796041210117545e-06, "loss": 0.0093, "reward": 5.0776519775390625, "reward_std": 2.2008752822875977, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5276516675949097, "rewards/reward_search_strategy": 0.550000011920929, "step": 447 }, { "completion_length": 395.75, "epoch": 1.5664335664335665, "grad_norm": 0.96751868724823, "kl": 0.24724078178405762, "learning_rate": 3.3714301183045382e-06, "loss": 0.0099, "reward": 5.352873802185059, "reward_std": 3.4447824954986572, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4778738021850586, "rewards/reward_search_strategy": 0.5, "step": 448 }, { "completion_length": 596.5, "epoch": 1.56993006993007, "grad_norm": 0.5340610146522522, "kl": 0.11072459816932678, "learning_rate": 3.3632454974963368e-06, "loss": 0.0044, "reward": 4.271090507507324, "reward_std": 0.11953616142272949, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.6210907101631165, "rewards/reward_search_strategy": 0.6499999761581421, "step": 449 }, { "completion_length": 577.375, "epoch": 1.5734265734265733, "grad_norm": 0.7681328058242798, "kl": 0.1562780886888504, "learning_rate": 3.3550503583141726e-06, "loss": 0.0063, "reward": 3.9707911014556885, "reward_std": 2.6017041206359863, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3957912027835846, "rewards/reward_search_strategy": 0.5750000476837158, "step": 450 }, { "completion_length": 354.125, "epoch": 1.5769230769230769, "grad_norm": 0.7792837023735046, "kl": 0.16238777339458466, "learning_rate": 3.346844800613229e-06, "loss": 0.0065, "reward": 5.095845699310303, "reward_std": 2.282075881958008, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5708456039428711, "rewards/reward_search_strategy": 0.6499999761581421, "step": 451 }, { "completion_length": 327.5, "epoch": 1.5804195804195804, "grad_norm": 6.922494888305664, "kl": 2.356689929962158, "learning_rate": 3.338628924375638e-06, "loss": 0.0943, "reward": 4.468880653381348, "reward_std": 3.3904285430908203, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.543880820274353, "rewards/reward_search_strategy": 0.675000011920929, "step": 452 }, { "completion_length": 207.0, "epoch": 1.583916083916084, "grad_norm": 1.4627685546875, "kl": 0.6507488489151001, "learning_rate": 3.3304028297092583e-06, "loss": 0.026, "reward": 7.305324554443359, "reward_std": 1.5086084604263306, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.280324250459671, "rewards/reward_search_strategy": 0.6500000357627869, "step": 453 }, { "completion_length": 614.5, "epoch": 1.5874125874125875, "grad_norm": 1.2405743598937988, "kl": 0.33275243639945984, "learning_rate": 3.3221666168464584e-06, "loss": 0.0133, "reward": 2.3712778091430664, "reward_std": 2.258321762084961, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.4462779760360718, "rewards/reward_search_strategy": 0.30000001192092896, "step": 454 }, { "completion_length": 568.625, "epoch": 1.5909090909090908, "grad_norm": 0.5220686793327332, "kl": 0.11742983013391495, "learning_rate": 3.313920386142892e-06, "loss": 0.0047, "reward": 3.185084819793701, "reward_std": 2.038414478302002, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4850848317146301, "rewards/reward_search_strategy": 0.699999988079071, "step": 455 }, { "completion_length": 622.125, "epoch": 1.5944055944055944, "grad_norm": 0.7013587355613708, "kl": 0.12143175303936005, "learning_rate": 3.3056642380762783e-06, "loss": 0.0049, "reward": 3.234304904937744, "reward_std": 2.801846742630005, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.5843049883842468, "rewards/reward_search_strategy": 0.4000000059604645, "step": 456 }, { "completion_length": 509.5, "epoch": 1.597902097902098, "grad_norm": 0.5290786027908325, "kl": 0.09905970096588135, "learning_rate": 3.2973982732451753e-06, "loss": 0.004, "reward": 4.47747278213501, "reward_std": 2.0540218353271484, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5774728655815125, "rewards/reward_search_strategy": 0.5249999761581421, "step": 457 }, { "completion_length": 490.125, "epoch": 1.6013986013986012, "grad_norm": 0.5015708804130554, "kl": 0.14589592814445496, "learning_rate": 3.2891225923677565e-06, "loss": 0.0058, "reward": 5.786191463470459, "reward_std": 2.496694326400757, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.43619152903556824, "rewards/reward_search_strategy": 0.8500000238418579, "step": 458 }, { "completion_length": 367.875, "epoch": 1.604895104895105, "grad_norm": 0.6818746328353882, "kl": 0.19857680797576904, "learning_rate": 3.280837296280582e-06, "loss": 0.0079, "reward": 5.759618759155273, "reward_std": 2.648275136947632, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.4596188962459564, "rewards/reward_search_strategy": 0.550000011920929, "step": 459 }, { "completion_length": 492.125, "epoch": 1.6083916083916083, "grad_norm": 0.7230591177940369, "kl": 0.17880992591381073, "learning_rate": 3.272542485937369e-06, "loss": 0.0072, "reward": 5.427669525146484, "reward_std": 2.917447328567505, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.7776695489883423, "rewards/reward_search_strategy": 0.7749999761581421, "step": 460 }, { "completion_length": 462.5, "epoch": 1.6118881118881119, "grad_norm": 0.903525173664093, "kl": 0.15019932389259338, "learning_rate": 3.2642382624077647e-06, "loss": 0.006, "reward": 3.903940439224243, "reward_std": 1.8853068351745605, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5539405345916748, "rewards/reward_search_strategy": 0.6000000238418579, "step": 461 }, { "completion_length": 389.125, "epoch": 1.6153846153846154, "grad_norm": 0.7114129662513733, "kl": 0.17341738939285278, "learning_rate": 3.2559247268761117e-06, "loss": 0.0069, "reward": 2.86030912399292, "reward_std": 1.3243414163589478, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.385309100151062, "rewards/reward_search_strategy": 0.4749999940395355, "step": 462 }, { "completion_length": 497.5, "epoch": 1.6188811188811187, "grad_norm": 1.1611593961715698, "kl": 0.1519286334514618, "learning_rate": 3.247601980640217e-06, "loss": 0.0061, "reward": 5.231886386871338, "reward_std": 2.9188578128814697, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4568866789340973, "rewards/reward_search_strategy": 0.6499999761581421, "step": 463 }, { "completion_length": 307.75, "epoch": 1.6223776223776225, "grad_norm": 1.214368462562561, "kl": 0.1455107182264328, "learning_rate": 3.2392701251101172e-06, "loss": 0.0058, "reward": 1.7257366180419922, "reward_std": 1.2823498249053955, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.42573660612106323, "rewards/reward_search_strategy": 0.30000001192092896, "step": 464 }, { "completion_length": 219.25, "epoch": 1.6258741258741258, "grad_norm": 1.7830798625946045, "kl": 0.24480244517326355, "learning_rate": 3.230929261806842e-06, "loss": 0.0098, "reward": 3.739067554473877, "reward_std": 3.551278829574585, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.38906753063201904, "rewards/reward_search_strategy": 0.3499999940395355, "step": 465 }, { "completion_length": 354.25, "epoch": 1.6293706293706294, "grad_norm": 0.9129117131233215, "kl": 0.17476214468479156, "learning_rate": 3.222579492361179e-06, "loss": 0.007, "reward": 3.277841567993164, "reward_std": 1.013387680053711, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.40284180641174316, "rewards/reward_search_strategy": 0.5, "step": 466 }, { "completion_length": 374.75, "epoch": 1.632867132867133, "grad_norm": 1.2731070518493652, "kl": 0.21385666728019714, "learning_rate": 3.214220918512434e-06, "loss": 0.0086, "reward": 4.312999725341797, "reward_std": 3.5308971405029297, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.23799952864646912, "rewards/reward_search_strategy": 0.44999998807907104, "step": 467 }, { "completion_length": 760.25, "epoch": 1.6363636363636362, "grad_norm": 1.427370309829712, "kl": 0.2925739288330078, "learning_rate": 3.205853642107192e-06, "loss": 0.0117, "reward": 5.048447132110596, "reward_std": 3.4149582386016846, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.47344690561294556, "rewards/reward_search_strategy": 0.574999988079071, "step": 468 }, { "completion_length": 659.875, "epoch": 1.63986013986014, "grad_norm": 0.9429287314414978, "kl": 0.11566608399152756, "learning_rate": 3.1974777650980737e-06, "loss": 0.0046, "reward": 3.4420485496520996, "reward_std": 2.3602750301361084, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.742048442363739, "rewards/reward_search_strategy": 0.44999998807907104, "step": 469 }, { "completion_length": 639.75, "epoch": 1.6433566433566433, "grad_norm": 2.0382609367370605, "kl": 0.14500349760055542, "learning_rate": 3.189093389542498e-06, "loss": 0.0058, "reward": 4.069566249847412, "reward_std": 2.456049919128418, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.54456627368927, "rewards/reward_search_strategy": 0.6499999761581421, "step": 470 }, { "completion_length": 443.0, "epoch": 1.6468531468531469, "grad_norm": 0.672690749168396, "kl": 0.12307699769735336, "learning_rate": 3.180700617601436e-06, "loss": 0.0049, "reward": 5.947089195251465, "reward_std": 2.471454620361328, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.4720890522003174, "rewards/reward_search_strategy": 0.4750000238418579, "step": 471 }, { "completion_length": 404.625, "epoch": 1.6503496503496504, "grad_norm": 0.76925128698349, "kl": 0.1666235327720642, "learning_rate": 3.1722995515381644e-06, "loss": 0.0067, "reward": 5.3943705558776855, "reward_std": 2.020536184310913, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5193708539009094, "rewards/reward_search_strategy": 0.625, "step": 472 }, { "completion_length": 538.0, "epoch": 1.6538461538461537, "grad_norm": 1.2525761127471924, "kl": 0.16103272140026093, "learning_rate": 3.1638902937170224e-06, "loss": 0.0064, "reward": 3.551636219024658, "reward_std": 2.179047107696533, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5516363978385925, "rewards/reward_search_strategy": 0.5, "step": 473 }, { "completion_length": 832.75, "epoch": 1.6573426573426573, "grad_norm": 0.3906629979610443, "kl": 0.08530028909444809, "learning_rate": 3.155472946602162e-06, "loss": 0.0034, "reward": 2.7689270973205566, "reward_std": 1.2982851266860962, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5689270496368408, "rewards/reward_search_strategy": 0.5750000476837158, "step": 474 }, { "completion_length": 530.25, "epoch": 1.6608391608391608, "grad_norm": 0.6616779565811157, "kl": 0.2439732700586319, "learning_rate": 3.147047612756302e-06, "loss": 0.0098, "reward": 1.7873218059539795, "reward_std": 1.3397150039672852, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.6123218536376953, "rewards/reward_search_strategy": 0.30000001192092896, "step": 475 }, { "completion_length": 371.875, "epoch": 1.6643356643356644, "grad_norm": 1.1894851922988892, "kl": 0.283910870552063, "learning_rate": 3.1386143948394764e-06, "loss": 0.0114, "reward": 3.811448812484741, "reward_std": 3.3633668422698975, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3864488899707794, "rewards/reward_search_strategy": 0.550000011920929, "step": 476 }, { "completion_length": 426.375, "epoch": 1.667832167832168, "grad_norm": 0.6441279649734497, "kl": 0.13555216789245605, "learning_rate": 3.130173395607785e-06, "loss": 0.0054, "reward": 4.598128318786621, "reward_std": 1.5403695106506348, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5481281280517578, "rewards/reward_search_strategy": 0.550000011920929, "step": 477 }, { "completion_length": 324.5, "epoch": 1.6713286713286712, "grad_norm": 2.565237283706665, "kl": 0.3227596580982208, "learning_rate": 3.121724717912138e-06, "loss": 0.0129, "reward": 4.640970230102539, "reward_std": 2.257411241531372, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.34097036719322205, "rewards/reward_search_strategy": 0.675000011920929, "step": 478 }, { "completion_length": 353.5, "epoch": 1.6748251748251748, "grad_norm": 0.9772915244102478, "kl": 0.19766215980052948, "learning_rate": 3.1132684646970068e-06, "loss": 0.0079, "reward": 6.462177276611328, "reward_std": 2.8117926120758057, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5121774077415466, "rewards/reward_search_strategy": 0.7000000476837158, "step": 479 }, { "completion_length": 353.25, "epoch": 1.6783216783216783, "grad_norm": 0.8048790097236633, "kl": 0.1595701426267624, "learning_rate": 3.1048047389991693e-06, "loss": 0.0064, "reward": 5.323690414428711, "reward_std": 2.957422971725464, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6236906051635742, "rewards/reward_search_strategy": 0.574999988079071, "step": 480 }, { "completion_length": 607.875, "epoch": 1.6818181818181817, "grad_norm": 0.6971364617347717, "kl": 0.15470711886882782, "learning_rate": 3.0963336439464527e-06, "loss": 0.0062, "reward": 4.107250690460205, "reward_std": 2.5991530418395996, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.48225075006484985, "rewards/reward_search_strategy": 0.625, "step": 481 }, { "completion_length": 505.0, "epoch": 1.6853146853146854, "grad_norm": 1.7211177349090576, "kl": 0.15958930552005768, "learning_rate": 3.087855282756475e-06, "loss": 0.0064, "reward": 3.6237030029296875, "reward_std": 1.4970024824142456, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5487030148506165, "rewards/reward_search_strategy": 0.574999988079071, "step": 482 }, { "completion_length": 410.5, "epoch": 1.6888111888111887, "grad_norm": 1.117256999015808, "kl": 0.22125303745269775, "learning_rate": 3.079369758735393e-06, "loss": 0.0089, "reward": 3.5827155113220215, "reward_std": 2.4601330757141113, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5577152967453003, "rewards/reward_search_strategy": 0.4000000059604645, "step": 483 }, { "completion_length": 405.625, "epoch": 1.6923076923076923, "grad_norm": 0.6825506687164307, "kl": 0.14363308250904083, "learning_rate": 3.0708771752766397e-06, "loss": 0.0057, "reward": 3.798617124557495, "reward_std": 0.7645675539970398, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.4986169636249542, "rewards/reward_search_strategy": 0.550000011920929, "step": 484 }, { "completion_length": 474.625, "epoch": 1.6958041958041958, "grad_norm": 1.18376886844635, "kl": 0.17103073000907898, "learning_rate": 3.062377635859663e-06, "loss": 0.0068, "reward": 4.56157112121582, "reward_std": 3.211364984512329, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3865714967250824, "rewards/reward_search_strategy": 0.675000011920929, "step": 485 }, { "completion_length": 290.625, "epoch": 1.6993006993006992, "grad_norm": 1.1491068601608276, "kl": 0.2933157980442047, "learning_rate": 3.053871244048669e-06, "loss": 0.0117, "reward": 3.049312114715576, "reward_std": 2.6080663204193115, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.29931211471557617, "rewards/reward_search_strategy": 0.5, "step": 486 }, { "completion_length": 419.75, "epoch": 1.702797202797203, "grad_norm": 1.6442235708236694, "kl": 0.22408655285835266, "learning_rate": 3.045358103491357e-06, "loss": 0.009, "reward": 5.084389686584473, "reward_std": 2.9285178184509277, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.534389853477478, "rewards/reward_search_strategy": 0.550000011920929, "step": 487 }, { "completion_length": 405.25, "epoch": 1.7062937062937062, "grad_norm": 7.855199337005615, "kl": 2.285917043685913, "learning_rate": 3.0368383179176584e-06, "loss": 0.0914, "reward": 3.893965721130371, "reward_std": 3.1409003734588623, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.1689659059047699, "rewards/reward_search_strategy": 0.4750000238418579, "step": 488 }, { "completion_length": 320.25, "epoch": 1.7097902097902098, "grad_norm": 0.8982135057449341, "kl": 0.25896814465522766, "learning_rate": 3.0283119911384724e-06, "loss": 0.0104, "reward": 4.6774492263793945, "reward_std": 2.9654905796051025, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4774494767189026, "rewards/reward_search_strategy": 0.574999988079071, "step": 489 }, { "completion_length": 326.75, "epoch": 1.7132867132867133, "grad_norm": 0.9028376340866089, "kl": 0.2348860800266266, "learning_rate": 3.019779227044398e-06, "loss": 0.0094, "reward": 4.503946304321289, "reward_std": 3.175311803817749, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.37894657254219055, "rewards/reward_search_strategy": 0.625, "step": 490 }, { "completion_length": 390.75, "epoch": 1.7167832167832167, "grad_norm": 0.7262877225875854, "kl": 0.1882961541414261, "learning_rate": 3.0112401296044756e-06, "loss": 0.0075, "reward": 2.486973762512207, "reward_std": 1.229779839515686, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4119737446308136, "rewards/reward_search_strategy": 0.44999998807907104, "step": 491 }, { "completion_length": 719.375, "epoch": 1.7202797202797204, "grad_norm": 0.6394368410110474, "kl": 0.13550426065921783, "learning_rate": 3.002694802864912e-06, "loss": 0.0054, "reward": 4.497361183166504, "reward_std": 2.7111971378326416, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.647361159324646, "rewards/reward_search_strategy": 0.6000000238418579, "step": 492 }, { "completion_length": 1683.875, "epoch": 1.7237762237762237, "grad_norm": 1.2878316640853882, "kl": 0.24534538388252258, "learning_rate": 2.9941433509478157e-06, "loss": 0.0098, "reward": 1.7936222553253174, "reward_std": 1.2284867763519287, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.4436222314834595, "rewards/reward_search_strategy": 0.4750000238418579, "step": 493 }, { "completion_length": 369.5, "epoch": 1.7272727272727273, "grad_norm": 0.6680236458778381, "kl": 0.18054969608783722, "learning_rate": 2.98558587804993e-06, "loss": 0.0072, "reward": 3.908217430114746, "reward_std": 0.9750527143478394, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5332175493240356, "rewards/reward_search_strategy": 0.625, "step": 494 }, { "completion_length": 839.875, "epoch": 1.7307692307692308, "grad_norm": 0.6535410284996033, "kl": 0.13271893560886383, "learning_rate": 2.9770224884413625e-06, "loss": 0.0053, "reward": 2.350466728210449, "reward_std": 2.074483871459961, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.5254667401313782, "rewards/reward_search_strategy": 0.32500001788139343, "step": 495 }, { "completion_length": 593.0, "epoch": 1.7342657342657342, "grad_norm": 0.8978703618049622, "kl": 0.22637799382209778, "learning_rate": 2.9684532864643123e-06, "loss": 0.0091, "reward": 4.150125503540039, "reward_std": 1.999029278755188, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5251256227493286, "rewards/reward_search_strategy": 0.625, "step": 496 }, { "completion_length": 728.625, "epoch": 1.737762237762238, "grad_norm": 0.7385975122451782, "kl": 0.17462122440338135, "learning_rate": 2.9598783765318005e-06, "loss": 0.007, "reward": 2.4210894107818604, "reward_std": 1.5048274993896484, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.49608945846557617, "rewards/reward_search_strategy": 0.550000011920929, "step": 497 }, { "completion_length": 657.875, "epoch": 1.7412587412587412, "grad_norm": 0.8704059720039368, "kl": 0.17740987241268158, "learning_rate": 2.9512978631264006e-06, "loss": 0.0071, "reward": 2.909700870513916, "reward_std": 1.5739132165908813, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5597009062767029, "rewards/reward_search_strategy": 0.4750000238418579, "step": 498 }, { "completion_length": 673.875, "epoch": 1.7447552447552448, "grad_norm": 0.7553533315658569, "kl": 0.17763131856918335, "learning_rate": 2.942711850798959e-06, "loss": 0.0071, "reward": 3.8633580207824707, "reward_std": 3.2166748046875, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.6133580207824707, "rewards/reward_search_strategy": 0.75, "step": 499 }, { "completion_length": 601.0, "epoch": 1.7482517482517483, "grad_norm": 1.2557988166809082, "kl": 0.3829733729362488, "learning_rate": 2.9341204441673267e-06, "loss": 0.0153, "reward": 2.2739150524139404, "reward_std": 1.694032073020935, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.3989151120185852, "rewards/reward_search_strategy": 0.375, "step": 500 }, { "completion_length": 699.5, "epoch": 1.7517482517482517, "grad_norm": 0.7813470959663391, "kl": 0.18124333024024963, "learning_rate": 2.9255237479150815e-06, "loss": 0.0072, "reward": 2.141329526901245, "reward_std": 1.5843234062194824, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.5913295745849609, "rewards/reward_search_strategy": 0.30000001192092896, "step": 501 }, { "completion_length": 465.375, "epoch": 1.7552447552447552, "grad_norm": 1.2914066314697266, "kl": 0.19192466139793396, "learning_rate": 2.9169218667902562e-06, "loss": 0.0077, "reward": 2.826913356781006, "reward_std": 1.4041271209716797, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4269134998321533, "rewards/reward_search_strategy": 0.5249999761581421, "step": 502 }, { "completion_length": 369.0, "epoch": 1.7587412587412588, "grad_norm": 0.9004430770874023, "kl": 0.19282333552837372, "learning_rate": 2.908314905604056e-06, "loss": 0.0077, "reward": 4.80116081237793, "reward_std": 3.8360373973846436, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.42616114020347595, "rewards/reward_search_strategy": 0.625, "step": 503 }, { "completion_length": 729.75, "epoch": 1.762237762237762, "grad_norm": 0.9142323732376099, "kl": 0.1846940666437149, "learning_rate": 2.8997029692295875e-06, "loss": 0.0074, "reward": 3.3747310638427734, "reward_std": 2.6164660453796387, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3497309386730194, "rewards/reward_search_strategy": 0.5249999761581421, "step": 504 }, { "completion_length": 439.875, "epoch": 1.7657342657342658, "grad_norm": 0.5326440334320068, "kl": 0.21149645745754242, "learning_rate": 2.8910861626005774e-06, "loss": 0.0085, "reward": 7.152305603027344, "reward_std": 2.5300772190093994, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5273054838180542, "rewards/reward_search_strategy": 0.875, "step": 505 }, { "completion_length": 377.75, "epoch": 1.7692307692307692, "grad_norm": 1.7362279891967773, "kl": 0.20955699682235718, "learning_rate": 2.8824645907100957e-06, "loss": 0.0084, "reward": 4.904088020324707, "reward_std": 2.917093515396118, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.17908819019794464, "rewards/reward_search_strategy": 0.6000000238418579, "step": 506 }, { "completion_length": 390.625, "epoch": 1.7727272727272727, "grad_norm": 1.1178964376449585, "kl": 0.29615768790245056, "learning_rate": 2.8738383586092745e-06, "loss": 0.0118, "reward": 5.1932878494262695, "reward_std": 3.258514165878296, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4932880401611328, "rewards/reward_search_strategy": 0.45000001788139343, "step": 507 }, { "completion_length": 257.875, "epoch": 1.7762237762237763, "grad_norm": 1.231881022453308, "kl": 0.3791511356830597, "learning_rate": 2.8652075714060296e-06, "loss": 0.0152, "reward": 5.252076625823975, "reward_std": 3.364706039428711, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3520766496658325, "rewards/reward_search_strategy": 0.5249999761581421, "step": 508 }, { "completion_length": 620.125, "epoch": 1.7797202797202796, "grad_norm": 1.4653148651123047, "kl": 0.2104392945766449, "learning_rate": 2.8565723342637797e-06, "loss": 0.0084, "reward": 4.483345031738281, "reward_std": 3.653613805770874, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.433345228433609, "rewards/reward_search_strategy": 0.42500001192092896, "step": 509 }, { "completion_length": 392.0, "epoch": 1.7832167832167833, "grad_norm": 0.5499141216278076, "kl": 0.21998637914657593, "learning_rate": 2.847932752400164e-06, "loss": 0.0088, "reward": 3.7722690105438232, "reward_std": 0.9182682037353516, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.522269070148468, "rewards/reward_search_strategy": 0.5, "step": 510 }, { "completion_length": 223.5, "epoch": 1.7867132867132867, "grad_norm": 2.654902219772339, "kl": 2.7393767833709717, "learning_rate": 2.8392889310857615e-06, "loss": 0.1096, "reward": 3.992436408996582, "reward_std": 3.132476806640625, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3424362540245056, "rewards/reward_search_strategy": 0.4000000059604645, "step": 511 }, { "completion_length": 367.125, "epoch": 1.7902097902097902, "grad_norm": 1.111373782157898, "kl": 0.1881609857082367, "learning_rate": 2.8306409756428067e-06, "loss": 0.0075, "reward": 3.7057249546051025, "reward_std": 2.5695714950561523, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.33072489500045776, "rewards/reward_search_strategy": 0.5, "step": 512 }, { "completion_length": 479.5, "epoch": 1.7937062937062938, "grad_norm": 0.621070921421051, "kl": 0.17087478935718536, "learning_rate": 2.8219889914439073e-06, "loss": 0.0068, "reward": 4.0383405685424805, "reward_std": 2.0420751571655273, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.43834051489830017, "rewards/reward_search_strategy": 0.7249999642372131, "step": 513 }, { "completion_length": 456.125, "epoch": 1.797202797202797, "grad_norm": 0.5883731245994568, "kl": 0.13777440786361694, "learning_rate": 2.813333083910761e-06, "loss": 0.0055, "reward": 3.0201869010925293, "reward_std": 2.117656946182251, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5451867580413818, "rewards/reward_search_strategy": 0.4750000238418579, "step": 514 }, { "completion_length": 727.5, "epoch": 1.8006993006993008, "grad_norm": 0.5874714255332947, "kl": 0.13230465352535248, "learning_rate": 2.804673358512869e-06, "loss": 0.0053, "reward": 4.169018745422363, "reward_std": 2.8212528228759766, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.26901865005493164, "rewards/reward_search_strategy": 0.6499999761581421, "step": 515 }, { "completion_length": 535.0, "epoch": 1.8041958041958042, "grad_norm": 0.6631115674972534, "kl": 0.1421699821949005, "learning_rate": 2.7960099207662535e-06, "loss": 0.0057, "reward": 3.8943893909454346, "reward_std": 2.7256648540496826, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.444389283657074, "rewards/reward_search_strategy": 0.44999998807907104, "step": 516 }, { "completion_length": 424.125, "epoch": 1.8076923076923077, "grad_norm": 0.8100361227989197, "kl": 0.19577130675315857, "learning_rate": 2.7873428762321667e-06, "loss": 0.0078, "reward": 6.033158302307129, "reward_std": 2.6231906414031982, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.4831581711769104, "rewards/reward_search_strategy": 0.6749999523162842, "step": 517 }, { "completion_length": 528.375, "epoch": 1.8111888111888113, "grad_norm": 1.45951247215271, "kl": 0.30490973591804504, "learning_rate": 2.778672330515814e-06, "loss": 0.0122, "reward": 4.320174217224121, "reward_std": 2.345180034637451, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.32017412781715393, "rewards/reward_search_strategy": 0.625, "step": 518 }, { "completion_length": 375.5, "epoch": 1.8146853146853146, "grad_norm": 1.146427869796753, "kl": 0.1250707358121872, "learning_rate": 2.769998389265057e-06, "loss": 0.005, "reward": 3.578047513961792, "reward_std": 1.96234929561615, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.45304757356643677, "rewards/reward_search_strategy": 0.75, "step": 519 }, { "completion_length": 535.25, "epoch": 1.8181818181818183, "grad_norm": 0.9977327585220337, "kl": 0.27235743403434753, "learning_rate": 2.761321158169134e-06, "loss": 0.0109, "reward": 4.564298629760742, "reward_std": 3.3091213703155518, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.43929851055145264, "rewards/reward_search_strategy": 0.625, "step": 520 }, { "completion_length": 231.125, "epoch": 1.8216783216783217, "grad_norm": 1.618683934211731, "kl": 0.2755619287490845, "learning_rate": 2.752640742957366e-06, "loss": 0.011, "reward": 4.865569114685059, "reward_std": 3.661264419555664, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.39056915044784546, "rewards/reward_search_strategy": 0.4750000238418579, "step": 521 }, { "completion_length": 636.625, "epoch": 1.8251748251748252, "grad_norm": 0.583243191242218, "kl": 0.16021624207496643, "learning_rate": 2.743957249397874e-06, "loss": 0.0064, "reward": 4.629208564758301, "reward_std": 2.8494086265563965, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4792088270187378, "rewards/reward_search_strategy": 0.7749999761581421, "step": 522 }, { "completion_length": 489.75, "epoch": 1.8286713286713288, "grad_norm": 0.998420000076294, "kl": 0.26722487807273865, "learning_rate": 2.7352707832962865e-06, "loss": 0.0107, "reward": 4.075039386749268, "reward_std": 1.9532568454742432, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3500395119190216, "rewards/reward_search_strategy": 0.4749999940395355, "step": 523 }, { "completion_length": 437.125, "epoch": 1.832167832167832, "grad_norm": 3.2135281562805176, "kl": 0.21277548372745514, "learning_rate": 2.726581450494451e-06, "loss": 0.0085, "reward": 2.2732386589050293, "reward_std": 1.2628512382507324, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4982386827468872, "rewards/reward_search_strategy": 0.5249999761581421, "step": 524 }, { "completion_length": 366.375, "epoch": 1.8356643356643356, "grad_norm": 1.3392611742019653, "kl": 0.24789753556251526, "learning_rate": 2.717889356869146e-06, "loss": 0.0099, "reward": 5.1830363273620605, "reward_std": 2.9729130268096924, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.25803637504577637, "rewards/reward_search_strategy": 0.550000011920929, "step": 525 }, { "completion_length": 509.875, "epoch": 1.8391608391608392, "grad_norm": 0.5417003035545349, "kl": 0.1483701765537262, "learning_rate": 2.70919460833079e-06, "loss": 0.0059, "reward": 7.1703081130981445, "reward_std": 2.587738037109375, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.7453081607818604, "rewards/reward_search_strategy": 0.800000011920929, "step": 526 }, { "completion_length": 367.75, "epoch": 1.8426573426573427, "grad_norm": 0.8764938116073608, "kl": 0.2148994356393814, "learning_rate": 2.700497310822147e-06, "loss": 0.0086, "reward": 5.292134761810303, "reward_std": 3.6617748737335205, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5421345829963684, "rewards/reward_search_strategy": 0.625, "step": 527 }, { "completion_length": 278.875, "epoch": 1.8461538461538463, "grad_norm": 0.9607499837875366, "kl": 0.23589320480823517, "learning_rate": 2.6917975703170466e-06, "loss": 0.0094, "reward": 4.2339372634887695, "reward_std": 2.0283195972442627, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.4589373767375946, "rewards/reward_search_strategy": 0.7749999761581421, "step": 528 }, { "completion_length": 363.75, "epoch": 1.8496503496503496, "grad_norm": 0.7568937540054321, "kl": 0.1478801816701889, "learning_rate": 2.6830954928190795e-06, "loss": 0.0059, "reward": 4.656356334686279, "reward_std": 2.3339731693267822, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.4563564360141754, "rewards/reward_search_strategy": 0.699999988079071, "step": 529 }, { "completion_length": 418.125, "epoch": 1.8531468531468531, "grad_norm": 0.73731529712677, "kl": 0.1510273814201355, "learning_rate": 2.6743911843603134e-06, "loss": 0.006, "reward": 4.6169233322143555, "reward_std": 2.509160041809082, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5919233560562134, "rewards/reward_search_strategy": 0.6499999761581421, "step": 530 }, { "completion_length": 393.625, "epoch": 1.8566433566433567, "grad_norm": 0.985019326210022, "kl": 0.25836557149887085, "learning_rate": 2.6656847510000013e-06, "loss": 0.0103, "reward": 3.9243826866149902, "reward_std": 2.617119073867798, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3493826687335968, "rewards/reward_search_strategy": 0.44999998807907104, "step": 531 }, { "completion_length": 274.625, "epoch": 1.86013986013986, "grad_norm": 1.0898783206939697, "kl": 0.2563922703266144, "learning_rate": 2.6569762988232838e-06, "loss": 0.0103, "reward": 3.603743076324463, "reward_std": 3.3509886264801025, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.42874306440353394, "rewards/reward_search_strategy": 0.42500001192092896, "step": 532 }, { "completion_length": 374.25, "epoch": 1.8636363636363638, "grad_norm": 0.7089191675186157, "kl": 0.15728232264518738, "learning_rate": 2.6482659339399047e-06, "loss": 0.0063, "reward": 4.477728366851807, "reward_std": 1.6571000814437866, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.35272836685180664, "rewards/reward_search_strategy": 0.75, "step": 533 }, { "completion_length": 473.0, "epoch": 1.867132867132867, "grad_norm": 0.6451985239982605, "kl": 0.1318826675415039, "learning_rate": 2.63955376248291e-06, "loss": 0.0053, "reward": 5.787966728210449, "reward_std": 2.74002742767334, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5629667639732361, "rewards/reward_search_strategy": 0.6000000238418579, "step": 534 }, { "completion_length": 436.375, "epoch": 1.8706293706293706, "grad_norm": 1.4988572597503662, "kl": 0.18932144343852997, "learning_rate": 2.6308398906073603e-06, "loss": 0.0076, "reward": 5.46368408203125, "reward_std": 2.972785711288452, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4886842966079712, "rewards/reward_search_strategy": 0.6000000238418579, "step": 535 }, { "completion_length": 355.375, "epoch": 1.8741258741258742, "grad_norm": 0.7932916879653931, "kl": 0.2548842132091522, "learning_rate": 2.6221244244890336e-06, "loss": 0.0102, "reward": 6.01389217376709, "reward_std": 2.8930044174194336, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4138926863670349, "rewards/reward_search_strategy": 0.6000000238418579, "step": 536 }, { "completion_length": 306.125, "epoch": 1.8776223776223775, "grad_norm": 1.055395483970642, "kl": 0.30341437458992004, "learning_rate": 2.613407470323134e-06, "loss": 0.0121, "reward": 5.066512107849121, "reward_std": 2.5190961360931396, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.21651223301887512, "rewards/reward_search_strategy": 0.6000000238418579, "step": 537 }, { "completion_length": 929.25, "epoch": 1.8811188811188813, "grad_norm": 0.603378176689148, "kl": 0.1319848597049713, "learning_rate": 2.604689134322999e-06, "loss": 0.0053, "reward": 4.212712287902832, "reward_std": 2.048562526702881, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.48771214485168457, "rewards/reward_search_strategy": 0.4750000238418579, "step": 538 }, { "completion_length": 777.625, "epoch": 1.8846153846153846, "grad_norm": 0.805946409702301, "kl": 0.1326817125082016, "learning_rate": 2.5959695227188e-06, "loss": 0.0053, "reward": 3.8090720176696777, "reward_std": 2.5989437103271484, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6590719223022461, "rewards/reward_search_strategy": 0.5249999761581421, "step": 539 }, { "completion_length": 604.0, "epoch": 1.8881118881118881, "grad_norm": 0.7141540050506592, "kl": 0.15499770641326904, "learning_rate": 2.587248741756253e-06, "loss": 0.0062, "reward": 5.391590118408203, "reward_std": 3.8308281898498535, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4915906488895416, "rewards/reward_search_strategy": 0.6499999761581421, "step": 540 }, { "completion_length": 426.875, "epoch": 1.8916083916083917, "grad_norm": 0.6566001772880554, "kl": 0.11781422793865204, "learning_rate": 2.578526897695321e-06, "loss": 0.0047, "reward": 6.018224716186523, "reward_std": 2.3312177658081055, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.41822493076324463, "rewards/reward_search_strategy": 0.6000000238418579, "step": 541 }, { "completion_length": 311.25, "epoch": 1.895104895104895, "grad_norm": 1.291000247001648, "kl": 0.2666287422180176, "learning_rate": 2.569804096808923e-06, "loss": 0.0107, "reward": 5.1927337646484375, "reward_std": 2.755157470703125, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3927338719367981, "rewards/reward_search_strategy": 0.550000011920929, "step": 542 }, { "completion_length": 305.25, "epoch": 1.8986013986013988, "grad_norm": 0.8758834600448608, "kl": 0.21868717670440674, "learning_rate": 2.5610804453816333e-06, "loss": 0.0087, "reward": 3.3073205947875977, "reward_std": 0.9015099406242371, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5323205590248108, "rewards/reward_search_strategy": 0.6499999761581421, "step": 543 }, { "completion_length": 392.5, "epoch": 1.902097902097902, "grad_norm": 1.5083352327346802, "kl": 0.21986426413059235, "learning_rate": 2.5523560497083927e-06, "loss": 0.0088, "reward": 7.508007049560547, "reward_std": 1.417626142501831, "rewards/reward_correctness": 1.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.008006956428289413, "rewards/reward_search_strategy": 0.875, "step": 544 }, { "completion_length": 394.0, "epoch": 1.9055944055944056, "grad_norm": 1.5424015522003174, "kl": 0.17503148317337036, "learning_rate": 2.543631016093209e-06, "loss": 0.007, "reward": 4.247199058532715, "reward_std": 1.7720770835876465, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.4471993148326874, "rewards/reward_search_strategy": 0.550000011920929, "step": 545 }, { "completion_length": 386.0, "epoch": 1.9090909090909092, "grad_norm": 0.7861531972885132, "kl": 0.20189355313777924, "learning_rate": 2.5349054508478636e-06, "loss": 0.0081, "reward": 5.381592750549316, "reward_std": 2.4070920944213867, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.48159298300743103, "rewards/reward_search_strategy": 0.6499999761581421, "step": 546 }, { "completion_length": 340.875, "epoch": 1.9125874125874125, "grad_norm": 0.8309083580970764, "kl": 0.2448170930147171, "learning_rate": 2.526179460290615e-06, "loss": 0.0098, "reward": 3.613577365875244, "reward_std": 2.6209349632263184, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5135773420333862, "rewards/reward_search_strategy": 0.6000000238418579, "step": 547 }, { "completion_length": 663.5, "epoch": 1.916083916083916, "grad_norm": 0.5724729895591736, "kl": 0.1197650209069252, "learning_rate": 2.517453150744904e-06, "loss": 0.0048, "reward": 3.980480909347534, "reward_std": 2.2730460166931152, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.55548095703125, "rewards/reward_search_strategy": 0.675000011920929, "step": 548 }, { "completion_length": 272.75, "epoch": 1.9195804195804196, "grad_norm": 1.6417477130889893, "kl": 0.28812453150749207, "learning_rate": 2.5087266285380597e-06, "loss": 0.0115, "reward": 2.912532329559326, "reward_std": 1.6371732950210571, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.23753249645233154, "rewards/reward_search_strategy": 0.5499999523162842, "step": 549 }, { "completion_length": 305.0, "epoch": 1.9230769230769231, "grad_norm": 6.134403228759766, "kl": 0.8665672540664673, "learning_rate": 2.5e-06, "loss": 0.0347, "reward": 4.181793212890625, "reward_std": 2.829068183898926, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4317927658557892, "rewards/reward_search_strategy": 0.625, "step": 550 }, { "completion_length": 298.0, "epoch": 1.9265734265734267, "grad_norm": 0.9454349279403687, "kl": 0.11578426510095596, "learning_rate": 2.4912733714619415e-06, "loss": 0.0046, "reward": 3.1753480434417725, "reward_std": 1.6148889064788818, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6503480672836304, "rewards/reward_search_strategy": 0.5250000357627869, "step": 551 }, { "completion_length": 278.375, "epoch": 1.93006993006993, "grad_norm": 1.9904608726501465, "kl": 0.2013421654701233, "learning_rate": 2.482546849255096e-06, "loss": 0.0081, "reward": 2.3399460315704346, "reward_std": 2.094362258911133, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.2149459570646286, "rewards/reward_search_strategy": 0.5, "step": 552 }, { "completion_length": 384.625, "epoch": 1.9335664335664335, "grad_norm": 0.7055719494819641, "kl": 0.16155293583869934, "learning_rate": 2.4738205397093863e-06, "loss": 0.0065, "reward": 5.295562744140625, "reward_std": 2.516092538833618, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.27056291699409485, "rewards/reward_search_strategy": 0.6499999761581421, "step": 553 }, { "completion_length": 403.375, "epoch": 1.937062937062937, "grad_norm": 0.6282983422279358, "kl": 0.10670147091150284, "learning_rate": 2.4650945491521372e-06, "loss": 0.0043, "reward": 3.8357903957366943, "reward_std": 1.950538992881775, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.11079052835702896, "rewards/reward_search_strategy": 0.7250000238418579, "step": 554 }, { "completion_length": 371.125, "epoch": 1.9405594405594404, "grad_norm": 0.9865713119506836, "kl": 0.13365398347377777, "learning_rate": 2.4563689839067913e-06, "loss": 0.0053, "reward": 4.407708168029785, "reward_std": 1.5128272771835327, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.4577085077762604, "rewards/reward_search_strategy": 0.44999998807907104, "step": 555 }, { "completion_length": 343.875, "epoch": 1.9440559440559442, "grad_norm": 2.774280548095703, "kl": 0.4545011520385742, "learning_rate": 2.447643950291608e-06, "loss": 0.0182, "reward": 5.597696781158447, "reward_std": 2.1334950923919678, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5226967334747314, "rewards/reward_search_strategy": 0.5750000476837158, "step": 556 }, { "completion_length": 439.25, "epoch": 1.9475524475524475, "grad_norm": 0.7445939779281616, "kl": 0.14428403973579407, "learning_rate": 2.4389195546183676e-06, "loss": 0.0058, "reward": 3.189345598220825, "reward_std": 1.6320502758026123, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3643457889556885, "rewards/reward_search_strategy": 0.5750000476837158, "step": 557 }, { "completion_length": 406.0, "epoch": 1.951048951048951, "grad_norm": 0.6262521147727966, "kl": 0.16001485288143158, "learning_rate": 2.4301959031910785e-06, "loss": 0.0064, "reward": 5.268270969390869, "reward_std": 2.528679609298706, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.39327099919319153, "rewards/reward_search_strategy": 0.75, "step": 558 }, { "completion_length": 571.125, "epoch": 1.9545454545454546, "grad_norm": 1.8445830345153809, "kl": 0.2330249845981598, "learning_rate": 2.4214731023046795e-06, "loss": 0.0093, "reward": 2.626037120819092, "reward_std": 2.412799596786499, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.3260372579097748, "rewards/reward_search_strategy": 0.42499998211860657, "step": 559 }, { "completion_length": 484.5, "epoch": 1.958041958041958, "grad_norm": 1.1257758140563965, "kl": 0.15154430270195007, "learning_rate": 2.4127512582437486e-06, "loss": 0.0061, "reward": 2.8468713760375977, "reward_std": 2.2975640296936035, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.42187127470970154, "rewards/reward_search_strategy": 0.42500001192092896, "step": 560 }, { "completion_length": 293.75, "epoch": 1.9615384615384617, "grad_norm": 3.8214211463928223, "kl": 0.45248979330062866, "learning_rate": 2.4040304772812002e-06, "loss": 0.0181, "reward": 3.833535671234131, "reward_std": 3.828199863433838, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.48353567719459534, "rewards/reward_search_strategy": 0.3500000238418579, "step": 561 }, { "completion_length": 335.5, "epoch": 1.965034965034965, "grad_norm": 1.3766762018203735, "kl": 0.2158210724592209, "learning_rate": 2.3953108656770018e-06, "loss": 0.0086, "reward": 4.993716239929199, "reward_std": 2.7581541538238525, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.018716657534241676, "rewards/reward_search_strategy": 0.7250000238418579, "step": 562 }, { "completion_length": 497.75, "epoch": 1.9685314685314685, "grad_norm": 1.5413566827774048, "kl": 0.2473594844341278, "learning_rate": 2.3865925296768658e-06, "loss": 0.0099, "reward": 2.4500536918640137, "reward_std": 1.235512137413025, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.40005365014076233, "rewards/reward_search_strategy": 0.550000011920929, "step": 563 }, { "completion_length": 401.625, "epoch": 1.972027972027972, "grad_norm": 0.9273636937141418, "kl": 0.21244309842586517, "learning_rate": 2.377875575510967e-06, "loss": 0.0085, "reward": 4.193811893463135, "reward_std": 0.9206790328025818, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6438118815422058, "rewards/reward_search_strategy": 0.550000011920929, "step": 564 }, { "completion_length": 536.0, "epoch": 1.9755244755244754, "grad_norm": 1.1288378238677979, "kl": 0.22064147889614105, "learning_rate": 2.3691601093926406e-06, "loss": 0.0088, "reward": 3.3308537006378174, "reward_std": 3.206730365753174, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.38085371255874634, "rewards/reward_search_strategy": 0.32499998807907104, "step": 565 }, { "completion_length": 478.125, "epoch": 1.9790209790209792, "grad_norm": 0.6461117267608643, "kl": 0.12631751596927643, "learning_rate": 2.3604462375170905e-06, "loss": 0.0051, "reward": 2.805847406387329, "reward_std": 2.364455461502075, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.6058473587036133, "rewards/reward_search_strategy": 0.44999998807907104, "step": 566 }, { "completion_length": 229.5, "epoch": 1.9825174825174825, "grad_norm": 0.9939329624176025, "kl": 0.25228554010391235, "learning_rate": 2.3517340660600965e-06, "loss": 0.0101, "reward": 5.576767921447754, "reward_std": 3.05527663230896, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.47676780819892883, "rewards/reward_search_strategy": 0.3499999940395355, "step": 567 }, { "completion_length": 445.125, "epoch": 1.986013986013986, "grad_norm": 1.2215253114700317, "kl": 0.19178111851215363, "learning_rate": 2.3430237011767166e-06, "loss": 0.0077, "reward": 2.854430913925171, "reward_std": 2.5454559326171875, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3544308841228485, "rewards/reward_search_strategy": 0.5, "step": 568 }, { "completion_length": 453.125, "epoch": 1.9895104895104896, "grad_norm": 1.3764644861221313, "kl": 0.23443952202796936, "learning_rate": 2.3343152490000004e-06, "loss": 0.0094, "reward": 3.74816632270813, "reward_std": 2.4638051986694336, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.2731662094593048, "rewards/reward_search_strategy": 0.7249999642372131, "step": 569 }, { "completion_length": 728.375, "epoch": 1.993006993006993, "grad_norm": 0.7834612727165222, "kl": 0.17851339280605316, "learning_rate": 2.325608815639687e-06, "loss": 0.0071, "reward": 3.360065221786499, "reward_std": 2.5360162258148193, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.48506492376327515, "rewards/reward_search_strategy": 0.5, "step": 570 }, { "completion_length": 317.125, "epoch": 1.9965034965034965, "grad_norm": 0.8286842107772827, "kl": 0.2042827308177948, "learning_rate": 2.3169045071809217e-06, "loss": 0.0082, "reward": 3.511277198791504, "reward_std": 3.0144357681274414, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.38627690076828003, "rewards/reward_search_strategy": 0.625, "step": 571 }, { "completion_length": 438.0, "epoch": 2.0, "grad_norm": 1.4197484254837036, "kl": 0.22094160318374634, "learning_rate": 2.3082024296829538e-06, "loss": 0.0088, "reward": 3.823275089263916, "reward_std": 2.203106164932251, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.22327494621276855, "rewards/reward_search_strategy": 0.6000000238418579, "step": 572 }, { "completion_length": 528.125, "epoch": 2.0034965034965033, "grad_norm": 1.1150870323181152, "kl": 0.16079114377498627, "learning_rate": 2.2995026891778533e-06, "loss": 0.0064, "reward": 3.081960916519165, "reward_std": 2.8381266593933105, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4069609045982361, "rewards/reward_search_strategy": 0.42500001192092896, "step": 573 }, { "completion_length": 447.375, "epoch": 2.006993006993007, "grad_norm": 2.044112205505371, "kl": 0.23929806053638458, "learning_rate": 2.290805391669212e-06, "loss": 0.0096, "reward": 3.1448910236358643, "reward_std": 1.6620686054229736, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4448910355567932, "rewards/reward_search_strategy": 0.574999988079071, "step": 574 }, { "completion_length": 344.125, "epoch": 2.0104895104895104, "grad_norm": 1.1256439685821533, "kl": 0.40603435039520264, "learning_rate": 2.2821106431308546e-06, "loss": 0.0162, "reward": 3.590481996536255, "reward_std": 3.215083360671997, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3404819071292877, "rewards/reward_search_strategy": 0.5, "step": 575 }, { "completion_length": 365.875, "epoch": 2.013986013986014, "grad_norm": 0.9045865535736084, "kl": 0.21638526022434235, "learning_rate": 2.2734185495055503e-06, "loss": 0.0087, "reward": 5.558139324188232, "reward_std": 2.632394552230835, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.6581395864486694, "rewards/reward_search_strategy": 0.6499999761581421, "step": 576 }, { "completion_length": 362.0, "epoch": 2.0174825174825175, "grad_norm": 1.3176411390304565, "kl": 0.29657936096191406, "learning_rate": 2.2647292167037143e-06, "loss": 0.0119, "reward": 3.193131685256958, "reward_std": 3.028653621673584, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.39313167333602905, "rewards/reward_search_strategy": 0.42500001192092896, "step": 577 }, { "completion_length": 613.375, "epoch": 2.020979020979021, "grad_norm": 0.735613226890564, "kl": 0.17025895416736603, "learning_rate": 2.256042750602127e-06, "loss": 0.0068, "reward": 4.768100261688232, "reward_std": 1.8358144760131836, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.643100380897522, "rewards/reward_search_strategy": 0.625, "step": 578 }, { "completion_length": 657.125, "epoch": 2.0244755244755246, "grad_norm": 0.9518574476242065, "kl": 0.14201372861862183, "learning_rate": 2.2473592570426343e-06, "loss": 0.0057, "reward": 4.414166450500488, "reward_std": 2.766263484954834, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.1641668975353241, "rewards/reward_search_strategy": 0.5, "step": 579 }, { "completion_length": 519.875, "epoch": 2.027972027972028, "grad_norm": 1.9180530309677124, "kl": 0.39803510904312134, "learning_rate": 2.238678841830867e-06, "loss": 0.0159, "reward": 2.850477695465088, "reward_std": 2.8477377891540527, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4254775941371918, "rewards/reward_search_strategy": 0.30000001192092896, "step": 580 }, { "completion_length": 263.125, "epoch": 2.0314685314685317, "grad_norm": 1.2091302871704102, "kl": 0.26268723607063293, "learning_rate": 2.230001610734943e-06, "loss": 0.0105, "reward": 3.0431175231933594, "reward_std": 0.7641079425811768, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.568117618560791, "rewards/reward_search_strategy": 0.7249999642372131, "step": 581 }, { "completion_length": 236.625, "epoch": 2.034965034965035, "grad_norm": 1.5325607061386108, "kl": 0.44615864753723145, "learning_rate": 2.2213276694841866e-06, "loss": 0.0178, "reward": 3.520362377166748, "reward_std": 3.250920295715332, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.22036239504814148, "rewards/reward_search_strategy": 0.42500001192092896, "step": 582 }, { "completion_length": 511.875, "epoch": 2.0384615384615383, "grad_norm": 0.9509716629981995, "kl": 0.23918908834457397, "learning_rate": 2.212657123767834e-06, "loss": 0.0096, "reward": 5.3234076499938965, "reward_std": 3.40258526802063, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.29840800166130066, "rewards/reward_search_strategy": 0.6499999761581421, "step": 583 }, { "completion_length": 608.5, "epoch": 2.041958041958042, "grad_norm": 0.7608333230018616, "kl": 0.17903365194797516, "learning_rate": 2.2039900792337477e-06, "loss": 0.0072, "reward": 3.5384345054626465, "reward_std": 0.9120908379554749, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6134345531463623, "rewards/reward_search_strategy": 0.675000011920929, "step": 584 }, { "completion_length": 319.125, "epoch": 2.0454545454545454, "grad_norm": 1.3616375923156738, "kl": 0.3396897614002228, "learning_rate": 2.195326641487132e-06, "loss": 0.0136, "reward": 3.8816871643066406, "reward_std": 3.524160623550415, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4816873073577881, "rewards/reward_search_strategy": 0.5249999761581421, "step": 585 }, { "completion_length": 298.625, "epoch": 2.0489510489510487, "grad_norm": 1.714475154876709, "kl": 0.467647522687912, "learning_rate": 2.186666916089239e-06, "loss": 0.0187, "reward": 3.5826728343963623, "reward_std": 3.505326747894287, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3826729357242584, "rewards/reward_search_strategy": 0.44999998807907104, "step": 586 }, { "completion_length": 329.0, "epoch": 2.0524475524475525, "grad_norm": 2.293823480606079, "kl": 0.3535889685153961, "learning_rate": 2.1780110085560935e-06, "loss": 0.0141, "reward": 3.534921169281006, "reward_std": 2.731309652328491, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.40992122888565063, "rewards/reward_search_strategy": 0.5, "step": 587 }, { "completion_length": 305.0, "epoch": 2.055944055944056, "grad_norm": 1.2662256956100464, "kl": 0.56394362449646, "learning_rate": 2.1693590243571937e-06, "loss": 0.0226, "reward": 3.4940059185028076, "reward_std": 2.9306962490081787, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.19400593638420105, "rewards/reward_search_strategy": 0.42500001192092896, "step": 588 }, { "completion_length": 755.75, "epoch": 2.0594405594405596, "grad_norm": 0.6191352605819702, "kl": 0.21323426067829132, "learning_rate": 2.1607110689142393e-06, "loss": 0.0085, "reward": 3.349336624145508, "reward_std": 2.5924930572509766, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3493368625640869, "rewards/reward_search_strategy": 0.375, "step": 589 }, { "completion_length": 618.375, "epoch": 2.062937062937063, "grad_norm": 1.132805585861206, "kl": 0.39246758818626404, "learning_rate": 2.1520672475998374e-06, "loss": 0.0157, "reward": 2.791980743408203, "reward_std": 1.946831226348877, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.5919808149337769, "rewards/reward_search_strategy": 0.45000001788139343, "step": 590 }, { "completion_length": 1027.625, "epoch": 2.0664335664335662, "grad_norm": 0.8593956828117371, "kl": 0.24245135486125946, "learning_rate": 2.143427665736221e-06, "loss": 0.0097, "reward": 2.7288267612457275, "reward_std": 2.470308303833008, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.5038267970085144, "rewards/reward_search_strategy": 0.3500000238418579, "step": 591 }, { "completion_length": 446.75, "epoch": 2.06993006993007, "grad_norm": 2.004469633102417, "kl": 0.5076056122779846, "learning_rate": 2.134792428593971e-06, "loss": 0.0203, "reward": 4.7919745445251465, "reward_std": 2.649038791656494, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4669745862483978, "rewards/reward_search_strategy": 0.45000001788139343, "step": 592 }, { "completion_length": 299.625, "epoch": 2.0734265734265733, "grad_norm": 1.6865434646606445, "kl": 0.4611359238624573, "learning_rate": 2.1261616413907267e-06, "loss": 0.0184, "reward": 2.921395778656006, "reward_std": 2.20475697517395, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4463959336280823, "rewards/reward_search_strategy": 0.4749999940395355, "step": 593 }, { "completion_length": 246.75, "epoch": 2.076923076923077, "grad_norm": 1.3372602462768555, "kl": 0.3958607316017151, "learning_rate": 2.117535409289905e-06, "loss": 0.0158, "reward": 3.5213401317596436, "reward_std": 3.8686110973358154, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.37134024500846863, "rewards/reward_search_strategy": 0.2750000059604645, "step": 594 }, { "completion_length": 248.375, "epoch": 2.0804195804195804, "grad_norm": 1.687395453453064, "kl": 0.48411720991134644, "learning_rate": 2.1089138373994226e-06, "loss": 0.0194, "reward": 2.76412034034729, "reward_std": 2.2025575637817383, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3891204297542572, "rewards/reward_search_strategy": 0.375, "step": 595 }, { "completion_length": 440.375, "epoch": 2.0839160839160837, "grad_norm": 5.519715309143066, "kl": 2.0341320037841797, "learning_rate": 2.1002970307704134e-06, "loss": 0.0814, "reward": 2.955754041671753, "reward_std": 2.849426031112671, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.18075412511825562, "rewards/reward_search_strategy": 0.4000000059604645, "step": 596 }, { "completion_length": 457.25, "epoch": 2.0874125874125875, "grad_norm": 0.6264410018920898, "kl": 0.43496525287628174, "learning_rate": 2.0916850943959453e-06, "loss": 0.0174, "reward": 3.4208319187164307, "reward_std": 1.2662279605865479, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.4708319306373596, "rewards/reward_search_strategy": 0.44999998807907104, "step": 597 }, { "completion_length": 537.75, "epoch": 2.090909090909091, "grad_norm": 0.9656305909156799, "kl": 0.3533865511417389, "learning_rate": 2.0830781332097446e-06, "loss": 0.0141, "reward": 2.179312229156494, "reward_std": 1.0181849002838135, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.32931211590766907, "rewards/reward_search_strategy": 0.4749999940395355, "step": 598 }, { "completion_length": 683.375, "epoch": 2.0944055944055946, "grad_norm": 1.1000615358352661, "kl": 0.28707969188690186, "learning_rate": 2.0744762520849193e-06, "loss": 0.0115, "reward": 2.6007080078125, "reward_std": 2.0923187732696533, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.6257079243659973, "rewards/reward_search_strategy": 0.3500000238418579, "step": 599 }, { "completion_length": 278.25, "epoch": 2.097902097902098, "grad_norm": 0.84709233045578, "kl": 0.35083308815956116, "learning_rate": 2.0658795558326745e-06, "loss": 0.014, "reward": 4.950100898742676, "reward_std": 2.8403358459472656, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.45010069012641907, "rewards/reward_search_strategy": 0.3750000298023224, "step": 600 }, { "completion_length": 620.375, "epoch": 2.1013986013986012, "grad_norm": 2.721386671066284, "kl": 0.4919606149196625, "learning_rate": 2.0572881492010423e-06, "loss": 0.0197, "reward": 1.8211491107940674, "reward_std": 1.338912010192871, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.396149218082428, "rewards/reward_search_strategy": 0.42500001192092896, "step": 601 }, { "completion_length": 429.875, "epoch": 2.104895104895105, "grad_norm": 2.013206720352173, "kl": 0.815765380859375, "learning_rate": 2.0487021368736002e-06, "loss": 0.0326, "reward": 1.7140884399414062, "reward_std": 1.3734639883041382, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3640882968902588, "rewards/reward_search_strategy": 0.3500000238418579, "step": 602 }, { "completion_length": 244.25, "epoch": 2.1083916083916083, "grad_norm": 1.3675538301467896, "kl": 0.43126821517944336, "learning_rate": 2.0401216234682e-06, "loss": 0.0173, "reward": 3.1364965438842773, "reward_std": 1.2145814895629883, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.43649643659591675, "rewards/reward_search_strategy": 0.45000001788139343, "step": 603 }, { "completion_length": 665.625, "epoch": 2.111888111888112, "grad_norm": 0.9919797778129578, "kl": 0.24450455605983734, "learning_rate": 2.031546713535688e-06, "loss": 0.0098, "reward": 4.791861534118652, "reward_std": 3.1218600273132324, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.7168612480163574, "rewards/reward_search_strategy": 0.32500001788139343, "step": 604 }, { "completion_length": 439.125, "epoch": 2.1153846153846154, "grad_norm": 2.6146435737609863, "kl": 1.0374516248703003, "learning_rate": 2.022977511558638e-06, "loss": 0.0415, "reward": 2.1190848350524902, "reward_std": 1.5598971843719482, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.19408495724201202, "rewards/reward_search_strategy": 0.17500001192092896, "step": 605 }, { "completion_length": 211.75, "epoch": 2.1188811188811187, "grad_norm": 1.2216012477874756, "kl": 0.6190358996391296, "learning_rate": 2.0144141219500707e-06, "loss": 0.0248, "reward": 1.935401439666748, "reward_std": 1.185782551765442, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.28540149331092834, "rewards/reward_search_strategy": 0.40000003576278687, "step": 606 }, { "completion_length": 123.25, "epoch": 2.1223776223776225, "grad_norm": 2.1749942302703857, "kl": 0.8067781925201416, "learning_rate": 2.0058566490521848e-06, "loss": 0.0323, "reward": 1.869102954864502, "reward_std": 2.623877763748169, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.2941029667854309, "rewards/reward_search_strategy": 0.20000000298023224, "step": 607 }, { "completion_length": 301.5, "epoch": 2.125874125874126, "grad_norm": 3.136425733566284, "kl": 0.8475527763366699, "learning_rate": 1.997305197135089e-06, "loss": 0.0339, "reward": 1.8826358318328857, "reward_std": 2.2515580654144287, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.15763583779335022, "rewards/reward_search_strategy": 0.22500000894069672, "step": 608 }, { "completion_length": 524.125, "epoch": 2.129370629370629, "grad_norm": 2.439591407775879, "kl": 0.6721910834312439, "learning_rate": 1.9887598703955244e-06, "loss": 0.0269, "reward": 0.996848464012146, "reward_std": 1.0712072849273682, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.3468484580516815, "rewards/reward_search_strategy": 0.15000000596046448, "step": 609 }, { "completion_length": 588.125, "epoch": 2.132867132867133, "grad_norm": 1.9819761514663696, "kl": 0.3836514353752136, "learning_rate": 1.9802207729556023e-06, "loss": 0.0153, "reward": 2.9225618839263916, "reward_std": 1.442522644996643, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5475618243217468, "rewards/reward_search_strategy": 0.25, "step": 610 }, { "completion_length": 324.125, "epoch": 2.1363636363636362, "grad_norm": 1.9776884317398071, "kl": 1.3968545198440552, "learning_rate": 1.971688008861529e-06, "loss": 0.0559, "reward": 1.0356595516204834, "reward_std": 1.2683775424957275, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.3356595039367676, "rewards/reward_search_strategy": 0.20000000298023224, "step": 611 }, { "completion_length": 682.875, "epoch": 2.13986013986014, "grad_norm": 0.6471878290176392, "kl": 0.2545889914035797, "learning_rate": 1.963161682082342e-06, "loss": 0.0102, "reward": 2.734546661376953, "reward_std": 1.6279962062835693, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3595465421676636, "rewards/reward_search_strategy": 0.375, "step": 612 }, { "completion_length": 239.25, "epoch": 2.1433566433566433, "grad_norm": 1.6059681177139282, "kl": 0.6571323275566101, "learning_rate": 1.9546418965086444e-06, "loss": 0.0263, "reward": 2.048656463623047, "reward_std": 1.680714726448059, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3236566185951233, "rewards/reward_search_strategy": 0.3500000238418579, "step": 613 }, { "completion_length": 247.5, "epoch": 2.1468531468531467, "grad_norm": 2.003173351287842, "kl": 0.8130755424499512, "learning_rate": 1.946128755951332e-06, "loss": 0.0325, "reward": 1.757265567779541, "reward_std": 1.6534045934677124, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.30726563930511475, "rewards/reward_search_strategy": 0.32500001788139343, "step": 614 }, { "completion_length": 636.125, "epoch": 2.1503496503496504, "grad_norm": 1.3049726486206055, "kl": 0.8581527471542358, "learning_rate": 1.937622364140338e-06, "loss": 0.0343, "reward": 1.4249939918518066, "reward_std": 1.3708144426345825, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.24999403953552246, "rewards/reward_search_strategy": 0.17500001192092896, "step": 615 }, { "completion_length": 1154.375, "epoch": 2.1538461538461537, "grad_norm": 0.4864519238471985, "kl": 0.18407891690731049, "learning_rate": 1.9291228247233607e-06, "loss": 0.0074, "reward": 1.4323234558105469, "reward_std": 1.0012611150741577, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.40732336044311523, "rewards/reward_search_strategy": 0.2750000059604645, "step": 616 }, { "completion_length": 104.75, "epoch": 2.1573426573426575, "grad_norm": 3.3319404125213623, "kl": 1.3707588911056519, "learning_rate": 1.9206302412646074e-06, "loss": 0.0548, "reward": 3.4996678829193115, "reward_std": 3.602128028869629, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.2746679186820984, "rewards/reward_search_strategy": 0.22500000894069672, "step": 617 }, { "completion_length": 1008.625, "epoch": 2.160839160839161, "grad_norm": 0.85741126537323, "kl": 0.5215395092964172, "learning_rate": 1.912144717243525e-06, "loss": 0.0209, "reward": 2.4926795959472656, "reward_std": 2.4891631603240967, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.5426796674728394, "rewards/reward_search_strategy": 0.32499998807907104, "step": 618 }, { "completion_length": 742.375, "epoch": 2.164335664335664, "grad_norm": 0.8654571175575256, "kl": 0.3221873342990875, "learning_rate": 1.9036663560535484e-06, "loss": 0.0129, "reward": 2.079622268676758, "reward_std": 1.5551748275756836, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3046221137046814, "rewards/reward_search_strategy": 0.2750000059604645, "step": 619 }, { "completion_length": 457.0, "epoch": 2.167832167832168, "grad_norm": 1.6892919540405273, "kl": 0.6353630423545837, "learning_rate": 1.895195261000831e-06, "loss": 0.0254, "reward": 0.9319507479667664, "reward_std": 1.268846035003662, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.2819507420063019, "rewards/reward_search_strategy": 0.15000000596046448, "step": 620 }, { "completion_length": 776.75, "epoch": 2.1713286713286712, "grad_norm": 2.699998140335083, "kl": 0.5228149890899658, "learning_rate": 1.8867315353029937e-06, "loss": 0.0209, "reward": 1.3579305410385132, "reward_std": 1.0180671215057373, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.3579305112361908, "rewards/reward_search_strategy": 0.25, "step": 621 }, { "completion_length": 335.25, "epoch": 2.174825174825175, "grad_norm": 2.1976499557495117, "kl": 0.7445406913757324, "learning_rate": 1.8782752820878636e-06, "loss": 0.0298, "reward": 1.4963700771331787, "reward_std": 0.7110073566436768, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.4213700294494629, "rewards/reward_search_strategy": 0.07500000298023224, "step": 622 }, { "completion_length": 475.5, "epoch": 2.1783216783216783, "grad_norm": 1.735268473625183, "kl": 0.5818000435829163, "learning_rate": 1.8698266043922159e-06, "loss": 0.0233, "reward": 1.7498990297317505, "reward_std": 1.372759222984314, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.4998989403247833, "rewards/reward_search_strategy": 0.25, "step": 623 }, { "completion_length": 504.0, "epoch": 2.1818181818181817, "grad_norm": 3.104727268218994, "kl": 0.8847209215164185, "learning_rate": 1.8613856051605242e-06, "loss": 0.0354, "reward": 3.2298035621643066, "reward_std": 0.9418405294418335, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4548037052154541, "rewards/reward_search_strategy": 0.15000000596046448, "step": 624 }, { "completion_length": 709.875, "epoch": 2.1853146853146854, "grad_norm": 0.9933962225914001, "kl": 0.35580018162727356, "learning_rate": 1.852952387243698e-06, "loss": 0.0142, "reward": 2.2982277870178223, "reward_std": 1.8821719884872437, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3982280194759369, "rewards/reward_search_strategy": 0.2750000059604645, "step": 625 }, { "completion_length": 329.625, "epoch": 2.1888111888111887, "grad_norm": 5.319577693939209, "kl": 3.290881872177124, "learning_rate": 1.8445270533978387e-06, "loss": 0.1316, "reward": 0.805007815361023, "reward_std": 0.9074010252952576, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.25500786304473877, "rewards/reward_search_strategy": 0.05000000074505806, "step": 626 }, { "completion_length": 699.5, "epoch": 2.1923076923076925, "grad_norm": 1.924681305885315, "kl": 1.104554533958435, "learning_rate": 1.836109706282978e-06, "loss": 0.0442, "reward": 1.0210304260253906, "reward_std": 1.3331161737442017, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.22103038430213928, "rewards/reward_search_strategy": 0.05000000074505806, "step": 627 }, { "completion_length": 568.25, "epoch": 2.195804195804196, "grad_norm": 2.6070520877838135, "kl": 0.5881224274635315, "learning_rate": 1.827700448461836e-06, "loss": 0.0235, "reward": 2.8544085025787354, "reward_std": 2.1881022453308105, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4044085443019867, "rewards/reward_search_strategy": 0.32500001788139343, "step": 628 }, { "completion_length": 1027.25, "epoch": 2.199300699300699, "grad_norm": 2.4598379135131836, "kl": 1.632110834121704, "learning_rate": 1.8192993823985643e-06, "loss": 0.0653, "reward": 0.9249999523162842, "reward_std": 2.6162948608398438, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.02500000037252903, "rewards/reward_search_strategy": 0.02500000037252903, "step": 629 }, { "completion_length": 627.0, "epoch": 2.202797202797203, "grad_norm": 2.1487743854522705, "kl": 1.5370110273361206, "learning_rate": 1.8109066104575023e-06, "loss": 0.0615, "reward": 1.460218906402588, "reward_std": 1.470046877861023, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.2602188289165497, "rewards/reward_search_strategy": 0.07500000298023224, "step": 630 }, { "completion_length": 869.375, "epoch": 2.2062937062937062, "grad_norm": 1.0303807258605957, "kl": 0.21061992645263672, "learning_rate": 1.8025222349019273e-06, "loss": 0.0084, "reward": 1.3355481624603271, "reward_std": 1.430586338043213, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.28554826974868774, "rewards/reward_search_strategy": 0.05000000074505806, "step": 631 }, { "completion_length": 728.625, "epoch": 2.20979020979021, "grad_norm": 0.9784128665924072, "kl": 0.28733620047569275, "learning_rate": 1.7941463578928088e-06, "loss": 0.0115, "reward": 1.639373779296875, "reward_std": 1.9725110530853271, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.2893737554550171, "rewards/reward_search_strategy": 0.22500000894069672, "step": 632 }, { "completion_length": 781.375, "epoch": 2.2132867132867133, "grad_norm": 1.5272443294525146, "kl": 0.4059949517250061, "learning_rate": 1.7857790814875665e-06, "loss": 0.0162, "reward": 2.185642719268799, "reward_std": 1.6133114099502563, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.2856428325176239, "rewards/reward_search_strategy": 0.15000000596046448, "step": 633 }, { "completion_length": 876.75, "epoch": 2.2167832167832167, "grad_norm": 3.8766872882843018, "kl": 1.414270281791687, "learning_rate": 1.7774205076388207e-06, "loss": 0.0566, "reward": 1.3451478481292725, "reward_std": 2.013153314590454, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.17014791071414948, "rewards/reward_search_strategy": 0.05000000074505806, "step": 634 }, { "completion_length": 698.0, "epoch": 2.2202797202797204, "grad_norm": 1.516858458518982, "kl": 0.39291489124298096, "learning_rate": 1.7690707381931585e-06, "loss": 0.0157, "reward": 0.7239600419998169, "reward_std": 1.0574333667755127, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.0989600345492363, "rewards/reward_search_strategy": 0.125, "step": 635 }, { "completion_length": 286.25, "epoch": 2.2237762237762237, "grad_norm": 1.2062504291534424, "kl": 0.4344251751899719, "learning_rate": 1.7607298748898844e-06, "loss": 0.0174, "reward": 1.9949274063110352, "reward_std": 1.525809645652771, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3449275493621826, "rewards/reward_search_strategy": 0.15000000596046448, "step": 636 }, { "completion_length": 398.125, "epoch": 2.227272727272727, "grad_norm": 3.9527926445007324, "kl": 0.7711962461471558, "learning_rate": 1.7523980193597837e-06, "loss": 0.0308, "reward": 1.9532475471496582, "reward_std": 1.5099598169326782, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.32824742794036865, "rewards/reward_search_strategy": 0.125, "step": 637 }, { "completion_length": 177.125, "epoch": 2.230769230769231, "grad_norm": 2.8524467945098877, "kl": 0.6512960195541382, "learning_rate": 1.744075273123889e-06, "loss": 0.0261, "reward": 2.0816469192504883, "reward_std": 1.550155520439148, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.256646990776062, "rewards/reward_search_strategy": 0.20000001788139343, "step": 638 }, { "completion_length": 833.25, "epoch": 2.234265734265734, "grad_norm": 0.5548487305641174, "kl": 0.31493040919303894, "learning_rate": 1.735761737592236e-06, "loss": 0.0126, "reward": 2.217541217803955, "reward_std": 1.338832974433899, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4675411880016327, "rewards/reward_search_strategy": 0.125, "step": 639 }, { "completion_length": 571.25, "epoch": 2.237762237762238, "grad_norm": 3.3657655715942383, "kl": 1.0643417835235596, "learning_rate": 1.7274575140626318e-06, "loss": 0.0426, "reward": 1.813431978225708, "reward_std": 2.622892141342163, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.11343199759721756, "rewards/reward_search_strategy": 0.07500000298023224, "step": 640 }, { "completion_length": 188.5, "epoch": 2.2412587412587412, "grad_norm": 2.92663311958313, "kl": 1.4145941734313965, "learning_rate": 1.7191627037194187e-06, "loss": 0.0566, "reward": 1.5885727405548096, "reward_std": 1.5621074438095093, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.2385728508234024, "rewards/reward_search_strategy": 0.10000000149011612, "step": 641 }, { "completion_length": 718.25, "epoch": 2.2447552447552446, "grad_norm": 1.4574460983276367, "kl": 0.43313488364219666, "learning_rate": 1.7108774076322443e-06, "loss": 0.0173, "reward": 2.001852035522461, "reward_std": 1.1423044204711914, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3768518567085266, "rewards/reward_search_strategy": 0.125, "step": 642 }, { "completion_length": 442.25, "epoch": 2.2482517482517483, "grad_norm": 1.1469643115997314, "kl": 0.6819137930870056, "learning_rate": 1.702601726754825e-06, "loss": 0.0273, "reward": 1.0776560306549072, "reward_std": 1.2006266117095947, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.22765597701072693, "rewards/reward_search_strategy": 0.10000000149011612, "step": 643 }, { "completion_length": 386.125, "epoch": 2.2517482517482517, "grad_norm": 3.1043453216552734, "kl": 1.2604275941848755, "learning_rate": 1.6943357619237227e-06, "loss": 0.0504, "reward": 1.6263604164123535, "reward_std": 1.5263493061065674, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.1513603925704956, "rewards/reward_search_strategy": 0.10000000149011612, "step": 644 }, { "completion_length": 394.75, "epoch": 2.2552447552447554, "grad_norm": 2.7489726543426514, "kl": 0.6823881268501282, "learning_rate": 1.686079613857109e-06, "loss": 0.0273, "reward": 2.122854232788086, "reward_std": 1.7154755592346191, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.2478543519973755, "rewards/reward_search_strategy": 0.125, "step": 645 }, { "completion_length": 578.25, "epoch": 2.2587412587412588, "grad_norm": 1.5313423871994019, "kl": 0.47184064984321594, "learning_rate": 1.677833383153542e-06, "loss": 0.0189, "reward": 2.3924877643585205, "reward_std": 1.742182970046997, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4424876868724823, "rewards/reward_search_strategy": 0.20000001788139343, "step": 646 }, { "completion_length": 731.125, "epoch": 2.262237762237762, "grad_norm": 0.7179574370384216, "kl": 0.28321006894111633, "learning_rate": 1.6695971702907425e-06, "loss": 0.0113, "reward": 3.0394439697265625, "reward_std": 1.496739149093628, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5394439697265625, "rewards/reward_search_strategy": 0.125, "step": 647 }, { "completion_length": 424.0, "epoch": 2.265734265734266, "grad_norm": 1.839329481124878, "kl": 0.757418155670166, "learning_rate": 1.661371075624363e-06, "loss": 0.0303, "reward": 1.5853004455566406, "reward_std": 1.868477463722229, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.2353004813194275, "rewards/reward_search_strategy": 0.10000000149011612, "step": 648 }, { "completion_length": 168.125, "epoch": 2.269230769230769, "grad_norm": 3.7892937660217285, "kl": 0.8482879996299744, "learning_rate": 1.6531551993867717e-06, "loss": 0.0339, "reward": 2.2620646953582764, "reward_std": 1.6340628862380981, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.0870647132396698, "rewards/reward_search_strategy": 0.17500001192092896, "step": 649 }, { "completion_length": 342.875, "epoch": 2.2727272727272725, "grad_norm": 1.6862787008285522, "kl": 0.7084638476371765, "learning_rate": 1.6449496416858285e-06, "loss": 0.0283, "reward": 2.0570688247680664, "reward_std": 1.0292710065841675, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3320687413215637, "rewards/reward_search_strategy": 0.22499999403953552, "step": 650 }, { "completion_length": 718.875, "epoch": 2.2762237762237763, "grad_norm": 1.3718791007995605, "kl": 0.31645065546035767, "learning_rate": 1.6367545025036634e-06, "loss": 0.0127, "reward": 2.4409759044647217, "reward_std": 1.5008270740509033, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.5409759283065796, "rewards/reward_search_strategy": 0.15000000596046448, "step": 651 }, { "completion_length": 1004.75, "epoch": 2.2797202797202796, "grad_norm": 17.508697509765625, "kl": 6.065591335296631, "learning_rate": 1.6285698816954626e-06, "loss": 0.2426, "reward": 1.4252126216888428, "reward_std": 1.1526154279708862, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.500212550163269, "rewards/reward_search_strategy": 0.17500001192092896, "step": 652 }, { "completion_length": 498.875, "epoch": 2.2832167832167833, "grad_norm": 2.261836051940918, "kl": 0.8655574917793274, "learning_rate": 1.6203958789882457e-06, "loss": 0.0346, "reward": 1.159027338027954, "reward_std": 1.3629136085510254, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.3090273141860962, "rewards/reward_search_strategy": 0.10000000149011612, "step": 653 }, { "completion_length": 168.5, "epoch": 2.2867132867132867, "grad_norm": 4.639626979827881, "kl": 1.1974122524261475, "learning_rate": 1.612232593979658e-06, "loss": 0.0479, "reward": 2.889040470123291, "reward_std": 1.7135334014892578, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.38904064893722534, "rewards/reward_search_strategy": 0.125, "step": 654 }, { "completion_length": 303.625, "epoch": 2.29020979020979, "grad_norm": 4.122119426727295, "kl": 1.0288439989089966, "learning_rate": 1.6040801261367494e-06, "loss": 0.0412, "reward": 2.0587358474731445, "reward_std": 1.550704836845398, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.20873577892780304, "rewards/reward_search_strategy": 0.10000000149011612, "step": 655 }, { "completion_length": 671.75, "epoch": 2.2937062937062938, "grad_norm": 1.04127836227417, "kl": 0.37351641058921814, "learning_rate": 1.5959385747947697e-06, "loss": 0.0149, "reward": 2.8860764503479004, "reward_std": 0.7615900039672852, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.43607643246650696, "rewards/reward_search_strategy": 0.20000000298023224, "step": 656 }, { "completion_length": 924.0, "epoch": 2.297202797202797, "grad_norm": 1.1038192510604858, "kl": 0.2962590754032135, "learning_rate": 1.5878080391559507e-06, "loss": 0.0119, "reward": 1.5142022371292114, "reward_std": 1.436745047569275, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.2892022132873535, "rewards/reward_search_strategy": 0.10000000149011612, "step": 657 }, { "completion_length": 178.625, "epoch": 2.300699300699301, "grad_norm": 1.9986516237258911, "kl": 0.6878660917282104, "learning_rate": 1.5796886182883053e-06, "loss": 0.0275, "reward": 2.1207919120788574, "reward_std": 0.9549412727355957, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4957919716835022, "rewards/reward_search_strategy": 0.125, "step": 658 }, { "completion_length": 414.5, "epoch": 2.304195804195804, "grad_norm": 2.3835370540618896, "kl": 0.5394484996795654, "learning_rate": 1.5715804111244138e-06, "loss": 0.0216, "reward": 2.5037827491760254, "reward_std": 1.56477689743042, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.27878281474113464, "rewards/reward_search_strategy": 0.10000000149011612, "step": 659 }, { "completion_length": 468.25, "epoch": 2.3076923076923075, "grad_norm": 0.8914346098899841, "kl": 0.70941162109375, "learning_rate": 1.56348351646022e-06, "loss": 0.0284, "reward": 2.8547205924987793, "reward_std": 2.1831724643707275, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.40472057461738586, "rewards/reward_search_strategy": 0.20000000298023224, "step": 660 }, { "completion_length": 198.5, "epoch": 2.3111888111888113, "grad_norm": 2.044799327850342, "kl": 0.5312392115592957, "learning_rate": 1.5553980329538326e-06, "loss": 0.0212, "reward": 2.9068050384521484, "reward_std": 1.2676535844802856, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5068050622940063, "rewards/reward_search_strategy": 0.15000000596046448, "step": 661 }, { "completion_length": 477.0, "epoch": 2.3146853146853146, "grad_norm": 0.9836262464523315, "kl": 0.42974257469177246, "learning_rate": 1.547324059124315e-06, "loss": 0.0172, "reward": 3.0365781784057617, "reward_std": 0.8693631291389465, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3365783393383026, "rewards/reward_search_strategy": 0.20000000298023224, "step": 662 }, { "completion_length": 394.0, "epoch": 2.3181818181818183, "grad_norm": 2.381326198577881, "kl": 0.9618586897850037, "learning_rate": 1.539261693350491e-06, "loss": 0.0385, "reward": 1.7162296772003174, "reward_std": 1.4073381423950195, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.11622962355613708, "rewards/reward_search_strategy": 0.10000000149011612, "step": 663 }, { "completion_length": 415.75, "epoch": 2.3216783216783217, "grad_norm": 2.788940191268921, "kl": 1.5708866119384766, "learning_rate": 1.5312110338697427e-06, "loss": 0.0628, "reward": 1.9888486862182617, "reward_std": 2.028597116470337, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.13884879648685455, "rewards/reward_search_strategy": 0.10000000149011612, "step": 664 }, { "completion_length": 476.0, "epoch": 2.325174825174825, "grad_norm": 0.7510575652122498, "kl": 0.3715492784976959, "learning_rate": 1.5231721787768162e-06, "loss": 0.0149, "reward": 2.291874885559082, "reward_std": 1.4457261562347412, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.31687483191490173, "rewards/reward_search_strategy": 0.10000000149011612, "step": 665 }, { "completion_length": 456.125, "epoch": 2.3286713286713288, "grad_norm": 3.3632757663726807, "kl": 1.4414145946502686, "learning_rate": 1.5151452260226224e-06, "loss": 0.0577, "reward": 2.4938864707946777, "reward_std": 2.5616908073425293, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.19388660788536072, "rewards/reward_search_strategy": 0.17500001192092896, "step": 666 }, { "completion_length": 467.125, "epoch": 2.332167832167832, "grad_norm": 1.0134179592132568, "kl": 0.31894758343696594, "learning_rate": 1.5071302734130488e-06, "loss": 0.0128, "reward": 2.0464019775390625, "reward_std": 1.3416796922683716, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.49640196561813354, "rewards/reward_search_strategy": 0.17499999701976776, "step": 667 }, { "completion_length": 763.875, "epoch": 2.335664335664336, "grad_norm": 1.411393404006958, "kl": 0.9449763298034668, "learning_rate": 1.4991274186077632e-06, "loss": 0.0378, "reward": 1.5500361919403076, "reward_std": 1.475958228111267, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.15003621578216553, "rewards/reward_search_strategy": 0.15000000596046448, "step": 668 }, { "completion_length": 671.5, "epoch": 2.339160839160839, "grad_norm": 1.0004576444625854, "kl": 0.4133380651473999, "learning_rate": 1.491136759119025e-06, "loss": 0.0165, "reward": 1.3330062627792358, "reward_std": 1.244538426399231, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.20800627768039703, "rewards/reward_search_strategy": 0.125, "step": 669 }, { "completion_length": 518.625, "epoch": 2.3426573426573425, "grad_norm": 1.836653470993042, "kl": 0.7970398664474487, "learning_rate": 1.4831583923105e-06, "loss": 0.0319, "reward": 1.3539539575576782, "reward_std": 1.3266041278839111, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.3289540112018585, "rewards/reward_search_strategy": 0.15000000596046448, "step": 670 }, { "completion_length": 421.5, "epoch": 2.3461538461538463, "grad_norm": 2.0048887729644775, "kl": 0.5317112803459167, "learning_rate": 1.4751924153960681e-06, "loss": 0.0213, "reward": 1.772258996963501, "reward_std": 1.371480107307434, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3972589075565338, "rewards/reward_search_strategy": 0.125, "step": 671 }, { "completion_length": 378.5, "epoch": 2.3496503496503496, "grad_norm": 1.0689804553985596, "kl": 0.20016981661319733, "learning_rate": 1.467238925438646e-06, "loss": 0.008, "reward": 2.407968759536743, "reward_std": 0.863514244556427, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.1329687088727951, "rewards/reward_search_strategy": 0.15000000596046448, "step": 672 }, { "completion_length": 120.75, "epoch": 2.3531468531468533, "grad_norm": 2.696688652038574, "kl": 0.5237963795661926, "learning_rate": 1.4592980193489975e-06, "loss": 0.021, "reward": 3.172454357147217, "reward_std": 0.6612268686294556, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.27245447039604187, "rewards/reward_search_strategy": 0.15000000596046448, "step": 673 }, { "completion_length": 487.375, "epoch": 2.3566433566433567, "grad_norm": 0.7538911700248718, "kl": 0.31118044257164, "learning_rate": 1.4513697938845571e-06, "loss": 0.0124, "reward": 2.9248735904693604, "reward_std": 1.1403623819351196, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3998734951019287, "rewards/reward_search_strategy": 0.2750000059604645, "step": 674 }, { "completion_length": 193.75, "epoch": 2.36013986013986, "grad_norm": 2.599130630493164, "kl": 1.8222945928573608, "learning_rate": 1.443454345648252e-06, "loss": 0.0729, "reward": 1.829810380935669, "reward_std": 1.482524037361145, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.2548103332519531, "rewards/reward_search_strategy": 0.07500000298023224, "step": 675 }, { "completion_length": 893.875, "epoch": 2.3636363636363638, "grad_norm": 1.1241508722305298, "kl": 0.28693926334381104, "learning_rate": 1.4355517710873184e-06, "loss": 0.0115, "reward": 1.5366313457489014, "reward_std": 1.5337618589401245, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.28663140535354614, "rewards/reward_search_strategy": 0.125, "step": 676 }, { "completion_length": 890.5, "epoch": 2.367132867132867, "grad_norm": 1.7457287311553955, "kl": 1.2755091190338135, "learning_rate": 1.4276621664921358e-06, "loss": 0.051, "reward": 2.140186071395874, "reward_std": 2.216279983520508, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.39018601179122925, "rewards/reward_search_strategy": 0.125, "step": 677 }, { "completion_length": 697.0, "epoch": 2.370629370629371, "grad_norm": 1.0398144721984863, "kl": 0.2402566373348236, "learning_rate": 1.419785627995044e-06, "loss": 0.0096, "reward": 1.8339523077011108, "reward_std": 1.089126706123352, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3089522123336792, "rewards/reward_search_strategy": 0.2750000059604645, "step": 678 }, { "completion_length": 821.25, "epoch": 2.374125874125874, "grad_norm": 0.8711546063423157, "kl": 0.35483434796333313, "learning_rate": 1.4119222515691817e-06, "loss": 0.0142, "reward": 2.207292079925537, "reward_std": 1.5479539632797241, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3322921395301819, "rewards/reward_search_strategy": 0.125, "step": 679 }, { "completion_length": 1002.25, "epoch": 2.3776223776223775, "grad_norm": 0.890271782875061, "kl": 0.2141900360584259, "learning_rate": 1.4040721330273063e-06, "loss": 0.0086, "reward": 1.7799444198608398, "reward_std": 1.9324290752410889, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.30494439601898193, "rewards/reward_search_strategy": 0.10000000149011612, "step": 680 }, { "completion_length": 168.5, "epoch": 2.3811188811188813, "grad_norm": 2.849769115447998, "kl": 1.1200069189071655, "learning_rate": 1.3962353680206372e-06, "loss": 0.0448, "reward": 1.9590483903884888, "reward_std": 1.711218237876892, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.28404849767684937, "rewards/reward_search_strategy": 0.17500001192092896, "step": 681 }, { "completion_length": 1380.375, "epoch": 2.3846153846153846, "grad_norm": 0.6261701583862305, "kl": 0.26690512895584106, "learning_rate": 1.388412052037682e-06, "loss": 0.0107, "reward": 0.6567364931106567, "reward_std": 1.0645313262939453, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.1817365139722824, "rewards/reward_search_strategy": 0.10000000149011612, "step": 682 }, { "completion_length": 129.25, "epoch": 2.3881118881118883, "grad_norm": 4.146401882171631, "kl": 3.074443817138672, "learning_rate": 1.380602280403076e-06, "loss": 0.123, "reward": 1.4597092866897583, "reward_std": 1.3509553670883179, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.25970932841300964, "rewards/reward_search_strategy": 0.20000000298023224, "step": 683 }, { "completion_length": 202.625, "epoch": 2.3916083916083917, "grad_norm": 1.3240187168121338, "kl": 0.7654680609703064, "learning_rate": 1.3728061482764238e-06, "loss": 0.0306, "reward": 1.6409448385238647, "reward_std": 1.162649393081665, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4409448504447937, "rewards/reward_search_strategy": 0.20000000298023224, "step": 684 }, { "completion_length": 461.375, "epoch": 2.395104895104895, "grad_norm": 2.429736852645874, "kl": 0.7713171243667603, "learning_rate": 1.3650237506511333e-06, "loss": 0.0309, "reward": 2.3407061100006104, "reward_std": 1.5831953287124634, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3157060444355011, "rewards/reward_search_strategy": 0.15000000596046448, "step": 685 }, { "completion_length": 449.125, "epoch": 2.3986013986013988, "grad_norm": 0.6447603106498718, "kl": 0.31759539246559143, "learning_rate": 1.3572551823532654e-06, "loss": 0.0127, "reward": 2.9339704513549805, "reward_std": 1.0821915864944458, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3839705288410187, "rewards/reward_search_strategy": 0.17499999701976776, "step": 686 }, { "completion_length": 889.75, "epoch": 2.402097902097902, "grad_norm": 1.8892614841461182, "kl": 0.7984387874603271, "learning_rate": 1.349500538040371e-06, "loss": 0.0319, "reward": 2.00244140625, "reward_std": 1.3795653581619263, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3024413287639618, "rewards/reward_search_strategy": 0.20000000298023224, "step": 687 }, { "completion_length": 489.125, "epoch": 2.4055944055944054, "grad_norm": 0.8772282600402832, "kl": 0.2710227370262146, "learning_rate": 1.3417599122003464e-06, "loss": 0.0108, "reward": 2.386509418487549, "reward_std": 1.5961159467697144, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.21150948107242584, "rewards/reward_search_strategy": 0.17500001192092896, "step": 688 }, { "completion_length": 254.25, "epoch": 2.409090909090909, "grad_norm": 0.9712724089622498, "kl": 0.3853650689125061, "learning_rate": 1.3340333991502723e-06, "loss": 0.0154, "reward": 3.6746137142181396, "reward_std": 0.21691659092903137, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.49961355328559875, "rewards/reward_search_strategy": 0.17499999701976776, "step": 689 }, { "completion_length": 759.625, "epoch": 2.4125874125874125, "grad_norm": 0.6003175973892212, "kl": 0.22895187139511108, "learning_rate": 1.3263210930352737e-06, "loss": 0.0092, "reward": 2.0592947006225586, "reward_std": 1.4781805276870728, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.15929487347602844, "rewards/reward_search_strategy": 0.15000000596046448, "step": 690 }, { "completion_length": 421.625, "epoch": 2.4160839160839163, "grad_norm": 1.1077606678009033, "kl": 0.45024362206459045, "learning_rate": 1.3186230878273654e-06, "loss": 0.018, "reward": 1.6011184453964233, "reward_std": 1.2054197788238525, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.2761184573173523, "rewards/reward_search_strategy": 0.07500000298023224, "step": 691 }, { "completion_length": 388.5, "epoch": 2.4195804195804196, "grad_norm": 3.1617443561553955, "kl": 0.7193136811256409, "learning_rate": 1.3109394773243117e-06, "loss": 0.0288, "reward": 2.1042187213897705, "reward_std": 1.3990535736083984, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.1542186737060547, "rewards/reward_search_strategy": 0.07500000298023224, "step": 692 }, { "completion_length": 1230.25, "epoch": 2.423076923076923, "grad_norm": 0.7175737023353577, "kl": 0.23203320801258087, "learning_rate": 1.3032703551484832e-06, "loss": 0.0093, "reward": 1.9773597717285156, "reward_std": 1.4208744764328003, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.4023597836494446, "rewards/reward_search_strategy": 0.20000000298023224, "step": 693 }, { "completion_length": 214.125, "epoch": 2.4265734265734267, "grad_norm": 0.7826201319694519, "kl": 0.3771539032459259, "learning_rate": 1.2956158147457116e-06, "loss": 0.0151, "reward": 2.5350708961486816, "reward_std": 1.1504623889923096, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.28507089614868164, "rewards/reward_search_strategy": 0.125, "step": 694 }, { "completion_length": 696.125, "epoch": 2.43006993006993, "grad_norm": 0.7204045057296753, "kl": 0.314938485622406, "learning_rate": 1.2879759493841577e-06, "loss": 0.0126, "reward": 2.8791046142578125, "reward_std": 1.4203031063079834, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3541046977043152, "rewards/reward_search_strategy": 0.15000000596046448, "step": 695 }, { "completion_length": 588.0, "epoch": 2.4335664335664333, "grad_norm": 0.778933584690094, "kl": 0.2978561520576477, "learning_rate": 1.280350852153168e-06, "loss": 0.0119, "reward": 3.2200334072113037, "reward_std": 0.9710680842399597, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.54503333568573, "rewards/reward_search_strategy": 0.17500001192092896, "step": 696 }, { "completion_length": 184.25, "epoch": 2.437062937062937, "grad_norm": 9.920829772949219, "kl": 0.615140974521637, "learning_rate": 1.272740615962148e-06, "loss": 0.0246, "reward": 2.8446221351623535, "reward_std": 1.2973819971084595, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.2946220636367798, "rewards/reward_search_strategy": 0.17500001192092896, "step": 697 }, { "completion_length": 201.625, "epoch": 2.4405594405594404, "grad_norm": 1.735718011856079, "kl": 0.6824493408203125, "learning_rate": 1.2651453335394232e-06, "loss": 0.0273, "reward": 2.5958385467529297, "reward_std": 0.21638303995132446, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.4208384156227112, "rewards/reward_search_strategy": 0.17499999701976776, "step": 698 }, { "completion_length": 711.25, "epoch": 2.444055944055944, "grad_norm": 0.6248417496681213, "kl": 0.2720755934715271, "learning_rate": 1.2575650974311118e-06, "loss": 0.0109, "reward": 2.844130039215088, "reward_std": 1.4015597105026245, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5191301107406616, "rewards/reward_search_strategy": 0.20000000298023224, "step": 699 }, { "completion_length": 466.75, "epoch": 2.4475524475524475, "grad_norm": 0.7954990267753601, "kl": 0.3459298610687256, "learning_rate": 1.2500000000000007e-06, "loss": 0.0138, "reward": 1.7812418937683105, "reward_std": 1.0424985885620117, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.13124188780784607, "rewards/reward_search_strategy": 0.15000000596046448, "step": 700 }, { "completion_length": 1009.875, "epoch": 2.451048951048951, "grad_norm": 0.7035873532295227, "kl": 0.24958501756191254, "learning_rate": 1.2424501334244124e-06, "loss": 0.01, "reward": 2.0463454723358154, "reward_std": 1.7158129215240479, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3963455259799957, "rewards/reward_search_strategy": 0.15000000596046448, "step": 701 }, { "completion_length": 180.25, "epoch": 2.4545454545454546, "grad_norm": 2.6211822032928467, "kl": 0.4796156585216522, "learning_rate": 1.234915589697091e-06, "loss": 0.0192, "reward": 2.8570384979248047, "reward_std": 0.7829788327217102, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.2570386230945587, "rewards/reward_search_strategy": 0.22500000894069672, "step": 702 }, { "completion_length": 420.0, "epoch": 2.458041958041958, "grad_norm": 0.5080879926681519, "kl": 0.3278014659881592, "learning_rate": 1.2273964606240718e-06, "loss": 0.0131, "reward": 3.5901222229003906, "reward_std": 0.6359260678291321, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.46512237191200256, "rewards/reward_search_strategy": 0.375, "step": 703 }, { "completion_length": 190.125, "epoch": 2.4615384615384617, "grad_norm": 1.098313570022583, "kl": 0.3493667542934418, "learning_rate": 1.2198928378235717e-06, "loss": 0.014, "reward": 3.258561611175537, "reward_std": 0.7251243591308594, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.30856141448020935, "rewards/reward_search_strategy": 0.20000001788139343, "step": 704 }, { "completion_length": 212.625, "epoch": 2.465034965034965, "grad_norm": 1.1919142007827759, "kl": 0.4384557902812958, "learning_rate": 1.2124048127248644e-06, "loss": 0.0175, "reward": 2.9238104820251465, "reward_std": 1.324345588684082, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3738102316856384, "rewards/reward_search_strategy": 0.17500001192092896, "step": 705 }, { "completion_length": 204.125, "epoch": 2.4685314685314683, "grad_norm": 1.9637373685836792, "kl": 0.5567646622657776, "learning_rate": 1.204932476567175e-06, "loss": 0.0223, "reward": 3.576674222946167, "reward_std": 1.737920880317688, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.1266743242740631, "rewards/reward_search_strategy": 0.20000000298023224, "step": 706 }, { "completion_length": 428.5, "epoch": 2.472027972027972, "grad_norm": 6.761178493499756, "kl": 4.387081146240234, "learning_rate": 1.19747592039856e-06, "loss": 0.1755, "reward": 1.8134260177612305, "reward_std": 1.3668913841247559, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.21342593431472778, "rewards/reward_search_strategy": 0.10000000149011612, "step": 707 }, { "completion_length": 507.375, "epoch": 2.4755244755244754, "grad_norm": 5.746307373046875, "kl": 1.5714839696884155, "learning_rate": 1.1900352350748026e-06, "loss": 0.0629, "reward": 1.7701107263565063, "reward_std": 1.0954716205596924, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.39511069655418396, "rewards/reward_search_strategy": 0.125, "step": 708 }, { "completion_length": 626.125, "epoch": 2.479020979020979, "grad_norm": 0.5907007455825806, "kl": 0.3258489966392517, "learning_rate": 1.1826105112583061e-06, "loss": 0.013, "reward": 3.2026278972625732, "reward_std": 0.7593327760696411, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.22762778401374817, "rewards/reward_search_strategy": 0.22499999403953552, "step": 709 }, { "completion_length": 454.625, "epoch": 2.4825174825174825, "grad_norm": 0.9080464839935303, "kl": 0.25247493386268616, "learning_rate": 1.1752018394169882e-06, "loss": 0.0101, "reward": 3.2749321460723877, "reward_std": 0.690960705280304, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.29993218183517456, "rewards/reward_search_strategy": 0.22499999403953552, "step": 710 }, { "completion_length": 984.125, "epoch": 2.486013986013986, "grad_norm": 0.5310540199279785, "kl": 0.18524622917175293, "learning_rate": 1.1678093098231748e-06, "loss": 0.0074, "reward": 2.905205488204956, "reward_std": 1.5567431449890137, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.40520551800727844, "rewards/reward_search_strategy": 0.25, "step": 711 }, { "completion_length": 408.0, "epoch": 2.4895104895104896, "grad_norm": 0.9158464074134827, "kl": 0.2190803438425064, "learning_rate": 1.160433012552508e-06, "loss": 0.0088, "reward": 2.580432415008545, "reward_std": 0.8634560108184814, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.4554321765899658, "rewards/reward_search_strategy": 0.25, "step": 712 }, { "completion_length": 353.375, "epoch": 2.493006993006993, "grad_norm": 3.563263416290283, "kl": 0.7851734161376953, "learning_rate": 1.1530730374828422e-06, "loss": 0.0314, "reward": 2.158771276473999, "reward_std": 1.2103830575942993, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.1087711751461029, "rewards/reward_search_strategy": 0.17500001192092896, "step": 713 }, { "completion_length": 379.625, "epoch": 2.4965034965034967, "grad_norm": 2.571286201477051, "kl": 0.4004494249820709, "learning_rate": 1.1457294742931508e-06, "loss": 0.016, "reward": 2.6059951782226562, "reward_std": 1.6195003986358643, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.45599544048309326, "rewards/reward_search_strategy": 0.2750000059604645, "step": 714 }, { "completion_length": 474.0, "epoch": 2.5, "grad_norm": 0.558914303779602, "kl": 0.26106932759284973, "learning_rate": 1.1384024124624324e-06, "loss": 0.0104, "reward": 2.2070064544677734, "reward_std": 0.6060339212417603, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.23200632631778717, "rewards/reward_search_strategy": 0.22499999403953552, "step": 715 }, { "completion_length": 443.125, "epoch": 2.5034965034965033, "grad_norm": 1.3880614042282104, "kl": 0.2877422869205475, "learning_rate": 1.1310919412686248e-06, "loss": 0.0115, "reward": 2.5891804695129395, "reward_std": 1.560691237449646, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.21418024599552155, "rewards/reward_search_strategy": 0.125, "step": 716 }, { "completion_length": 619.375, "epoch": 2.506993006993007, "grad_norm": 0.6024248600006104, "kl": 0.2830110192298889, "learning_rate": 1.1237981497875112e-06, "loss": 0.0113, "reward": 3.431725025177002, "reward_std": 0.8502019047737122, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.4817250370979309, "rewards/reward_search_strategy": 0.20000001788139343, "step": 717 }, { "completion_length": 560.125, "epoch": 2.5104895104895104, "grad_norm": 0.35110989212989807, "kl": 0.19600240886211395, "learning_rate": 1.11652112689164e-06, "loss": 0.0078, "reward": 3.0117030143737793, "reward_std": 1.6941925287246704, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3367030918598175, "rewards/reward_search_strategy": 0.17500001192092896, "step": 718 }, { "completion_length": 513.5, "epoch": 2.513986013986014, "grad_norm": 1.087957739830017, "kl": 0.3358318507671356, "learning_rate": 1.109260961249238e-06, "loss": 0.0134, "reward": 2.3795523643493652, "reward_std": 1.3459433317184448, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3795522451400757, "rewards/reward_search_strategy": 0.25, "step": 719 }, { "completion_length": 152.5, "epoch": 2.5174825174825175, "grad_norm": 5.490536212921143, "kl": 0.859663188457489, "learning_rate": 1.1020177413231334e-06, "loss": 0.0344, "reward": 3.666757822036743, "reward_std": 1.9958460330963135, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.41675788164138794, "rewards/reward_search_strategy": 0.125, "step": 720 }, { "completion_length": 502.125, "epoch": 2.520979020979021, "grad_norm": 0.45525604486465454, "kl": 0.17627465724945068, "learning_rate": 1.0947915553696742e-06, "loss": 0.0071, "reward": 2.1229989528656006, "reward_std": 0.5150920748710632, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.14799894392490387, "rewards/reward_search_strategy": 0.22500000894069672, "step": 721 }, { "completion_length": 196.125, "epoch": 2.5244755244755246, "grad_norm": 3.8241090774536133, "kl": 0.6243429183959961, "learning_rate": 1.0875824914376555e-06, "loss": 0.025, "reward": 3.5314383506774902, "reward_std": 1.662335991859436, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.08143842220306396, "rewards/reward_search_strategy": 0.20000001788139343, "step": 722 }, { "completion_length": 841.75, "epoch": 2.527972027972028, "grad_norm": 0.7508696913719177, "kl": 0.1930091381072998, "learning_rate": 1.0803906373672477e-06, "loss": 0.0077, "reward": 1.3541213274002075, "reward_std": 1.1997473239898682, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.45412132143974304, "rewards/reward_search_strategy": 0.15000000596046448, "step": 723 }, { "completion_length": 250.5, "epoch": 2.5314685314685317, "grad_norm": 5.6916422843933105, "kl": 0.3887576460838318, "learning_rate": 1.073216080788921e-06, "loss": 0.0156, "reward": 2.7577133178710938, "reward_std": 1.303903341293335, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.38271328806877136, "rewards/reward_search_strategy": 0.25, "step": 724 }, { "completion_length": 410.25, "epoch": 2.534965034965035, "grad_norm": 5.613461017608643, "kl": 1.09822416305542, "learning_rate": 1.0660589091223854e-06, "loss": 0.0439, "reward": 1.9298245906829834, "reward_std": 1.4367464780807495, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.25482451915740967, "rewards/reward_search_strategy": 0.17499999701976776, "step": 725 }, { "completion_length": 369.625, "epoch": 2.5384615384615383, "grad_norm": 3.449610471725464, "kl": 0.4649573862552643, "learning_rate": 1.0589192095755172e-06, "loss": 0.0186, "reward": 2.935774087905884, "reward_std": 1.777270793914795, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.23577404022216797, "rewards/reward_search_strategy": 0.20000000298023224, "step": 726 }, { "completion_length": 445.875, "epoch": 2.541958041958042, "grad_norm": 1.978943109512329, "kl": 3.9132683277130127, "learning_rate": 1.0517970691433035e-06, "loss": 0.1565, "reward": 2.3552181720733643, "reward_std": 1.6439377069473267, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.30521807074546814, "rewards/reward_search_strategy": 0.17500001192092896, "step": 727 }, { "completion_length": 314.0, "epoch": 2.5454545454545454, "grad_norm": 0.800722062587738, "kl": 0.34331655502319336, "learning_rate": 1.0446925746067768e-06, "loss": 0.0137, "reward": 3.17592716217041, "reward_std": 0.8327671885490417, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3259273171424866, "rewards/reward_search_strategy": 0.22500000894069672, "step": 728 }, { "completion_length": 202.625, "epoch": 2.548951048951049, "grad_norm": 1.2761588096618652, "kl": 0.47718727588653564, "learning_rate": 1.0376058125319614e-06, "loss": 0.0191, "reward": 3.906193733215332, "reward_std": 1.6910558938980103, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.33119359612464905, "rewards/reward_search_strategy": 0.20000000298023224, "step": 729 }, { "completion_length": 409.5, "epoch": 2.5524475524475525, "grad_norm": 0.8657374382019043, "kl": 0.22751927375793457, "learning_rate": 1.0305368692688175e-06, "loss": 0.0091, "reward": 2.2246451377868652, "reward_std": 1.362064242362976, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.1996452510356903, "rewards/reward_search_strategy": 0.15000000596046448, "step": 730 }, { "completion_length": 223.875, "epoch": 2.555944055944056, "grad_norm": 3.785691261291504, "kl": 0.5875170230865479, "learning_rate": 1.0234858309501864e-06, "loss": 0.0235, "reward": 2.56313157081604, "reward_std": 1.660637617111206, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3881314694881439, "rewards/reward_search_strategy": 0.17500001192092896, "step": 731 }, { "completion_length": 218.875, "epoch": 2.5594405594405596, "grad_norm": 0.7490137815475464, "kl": 0.369142085313797, "learning_rate": 1.0164527834907468e-06, "loss": 0.0148, "reward": 2.558255672454834, "reward_std": 0.9317966103553772, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.20825548470020294, "rewards/reward_search_strategy": 0.22500000894069672, "step": 732 }, { "completion_length": 223.0, "epoch": 2.562937062937063, "grad_norm": 1.1031575202941895, "kl": 0.2784157395362854, "learning_rate": 1.0094378125859602e-06, "loss": 0.0111, "reward": 2.956556797027588, "reward_std": 0.8614805340766907, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.33155685663223267, "rewards/reward_search_strategy": 0.125, "step": 733 }, { "completion_length": 157.375, "epoch": 2.5664335664335667, "grad_norm": 1.3781330585479736, "kl": 0.3153168261051178, "learning_rate": 1.0024410037110358e-06, "loss": 0.0126, "reward": 2.4788851737976074, "reward_std": 0.7455314993858337, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.4038851261138916, "rewards/reward_search_strategy": 0.20000000298023224, "step": 734 }, { "completion_length": 228.125, "epoch": 2.56993006993007, "grad_norm": 10.229633331298828, "kl": 0.45400354266166687, "learning_rate": 9.95462442119879e-07, "loss": 0.0182, "reward": 2.805171012878418, "reward_std": 1.4362815618515015, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.20517095923423767, "rewards/reward_search_strategy": 0.22500000894069672, "step": 735 }, { "completion_length": 556.375, "epoch": 2.5734265734265733, "grad_norm": 6.25646448135376, "kl": 0.5308452844619751, "learning_rate": 9.88502212844063e-07, "loss": 0.0212, "reward": 2.1313388347625732, "reward_std": 1.305338978767395, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.43133872747421265, "rewards/reward_search_strategy": 0.20000000298023224, "step": 736 }, { "completion_length": 431.875, "epoch": 2.5769230769230766, "grad_norm": 1.5778599977493286, "kl": 0.3882601857185364, "learning_rate": 9.815604006917839e-07, "loss": 0.0155, "reward": 3.1895384788513184, "reward_std": 2.0701403617858887, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.26453855633735657, "rewards/reward_search_strategy": 0.17500001192092896, "step": 737 }, { "completion_length": 435.125, "epoch": 2.5804195804195804, "grad_norm": 0.7020029425621033, "kl": 0.3426036536693573, "learning_rate": 9.746370902468311e-07, "loss": 0.0137, "reward": 2.656763792037964, "reward_std": 1.174736738204956, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.1567637324333191, "rewards/reward_search_strategy": 0.125, "step": 738 }, { "completion_length": 207.25, "epoch": 2.583916083916084, "grad_norm": 5.512199401855469, "kl": 0.5406113266944885, "learning_rate": 9.677323658675594e-07, "loss": 0.0216, "reward": 3.278657913208008, "reward_std": 1.3534682989120483, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.40365782380104065, "rewards/reward_search_strategy": 0.25, "step": 739 }, { "completion_length": 211.0, "epoch": 2.5874125874125875, "grad_norm": 3.1115472316741943, "kl": 0.5154640078544617, "learning_rate": 9.608463116858544e-07, "loss": 0.0206, "reward": 2.590893268585205, "reward_std": 1.163812279701233, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.06589316576719284, "rewards/reward_search_strategy": 0.15000000596046448, "step": 740 }, { "completion_length": 248.25, "epoch": 2.590909090909091, "grad_norm": 2.7806103229522705, "kl": 0.8573938608169556, "learning_rate": 9.53979011606115e-07, "loss": 0.0343, "reward": 1.8006809949874878, "reward_std": 1.4302541017532349, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.10068107396364212, "rewards/reward_search_strategy": 0.20000001788139343, "step": 741 }, { "completion_length": 334.375, "epoch": 2.594405594405594, "grad_norm": 2.6499695777893066, "kl": 1.568938970565796, "learning_rate": 9.471305493042243e-07, "loss": 0.0628, "reward": 1.7304112911224365, "reward_std": 1.5871738195419312, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.08041128516197205, "rewards/reward_search_strategy": 0.15000000596046448, "step": 742 }, { "completion_length": 226.0, "epoch": 2.597902097902098, "grad_norm": 0.8820388317108154, "kl": 0.2680734097957611, "learning_rate": 9.403010082265351e-07, "loss": 0.0107, "reward": 3.0550928115844727, "reward_std": 0.75883549451828, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3800925612449646, "rewards/reward_search_strategy": 0.17499999701976776, "step": 743 }, { "completion_length": 395.125, "epoch": 2.6013986013986012, "grad_norm": 1.0020458698272705, "kl": 0.6606673002243042, "learning_rate": 9.334904715888496e-07, "loss": 0.0264, "reward": 2.4976449012756348, "reward_std": 1.1654491424560547, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.2476448118686676, "rewards/reward_search_strategy": 0.125, "step": 744 }, { "completion_length": 212.375, "epoch": 2.604895104895105, "grad_norm": 0.9672914743423462, "kl": 0.43554913997650146, "learning_rate": 9.266990223754069e-07, "loss": 0.0174, "reward": 2.692328453063965, "reward_std": 0.663333535194397, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.24232834577560425, "rewards/reward_search_strategy": 0.20000001788139343, "step": 745 }, { "completion_length": 146.5, "epoch": 2.6083916083916083, "grad_norm": 2.657318353652954, "kl": 0.53204745054245, "learning_rate": 9.199267433378728e-07, "loss": 0.0213, "reward": 3.8679394721984863, "reward_std": 2.174532413482666, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.24293950200080872, "rewards/reward_search_strategy": 0.125, "step": 746 }, { "completion_length": 465.375, "epoch": 2.6118881118881117, "grad_norm": 0.7675890326499939, "kl": 0.2839233875274658, "learning_rate": 9.131737169943314e-07, "loss": 0.0114, "reward": 3.4565646648406982, "reward_std": 0.8739888668060303, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.5565648078918457, "rewards/reward_search_strategy": 0.15000000596046448, "step": 747 }, { "completion_length": 374.375, "epoch": 2.6153846153846154, "grad_norm": 3.154127597808838, "kl": 1.1787596940994263, "learning_rate": 9.064400256282757e-07, "loss": 0.0472, "reward": 2.2147796154022217, "reward_std": 1.5428773164749146, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.0897795557975769, "rewards/reward_search_strategy": 0.125, "step": 748 }, { "completion_length": 261.0, "epoch": 2.6188811188811187, "grad_norm": 0.8691149353981018, "kl": 0.2806665003299713, "learning_rate": 8.99725751287611e-07, "loss": 0.0112, "reward": 3.3426637649536133, "reward_std": 0.21651490032672882, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.1426638960838318, "rewards/reward_search_strategy": 0.20000000298023224, "step": 749 }, { "completion_length": 199.75, "epoch": 2.6223776223776225, "grad_norm": 4.052029609680176, "kl": 0.5289004445075989, "learning_rate": 8.930309757836517e-07, "loss": 0.0212, "reward": 2.488175392150879, "reward_std": 1.7952224016189575, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4631752669811249, "rewards/reward_search_strategy": 0.15000000596046448, "step": 750 }, { "completion_length": 443.125, "epoch": 2.625874125874126, "grad_norm": 0.7876097559928894, "kl": 0.30925026535987854, "learning_rate": 8.863557806901233e-07, "loss": 0.0124, "reward": 1.4860601425170898, "reward_std": 1.2915945053100586, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.33606013655662537, "rewards/reward_search_strategy": 0.15000000596046448, "step": 751 }, { "completion_length": 417.125, "epoch": 2.629370629370629, "grad_norm": 0.8596819043159485, "kl": 0.23684144020080566, "learning_rate": 8.797002473421729e-07, "loss": 0.0095, "reward": 1.724004864692688, "reward_std": 1.1447079181671143, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3490048944950104, "rewards/reward_search_strategy": 0.125, "step": 752 }, { "completion_length": 185.0, "epoch": 2.632867132867133, "grad_norm": 1.505584478378296, "kl": 0.561983585357666, "learning_rate": 8.73064456835373e-07, "loss": 0.0225, "reward": 1.603018045425415, "reward_std": 1.2052562236785889, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.20301805436611176, "rewards/reward_search_strategy": 0.15000000596046448, "step": 753 }, { "completion_length": 440.375, "epoch": 2.6363636363636362, "grad_norm": 0.9618105888366699, "kl": 0.3907013237476349, "learning_rate": 8.664484900247363e-07, "loss": 0.0156, "reward": 2.312002182006836, "reward_std": 0.7122796773910522, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.4120020270347595, "rewards/reward_search_strategy": 0.15000000596046448, "step": 754 }, { "completion_length": 423.375, "epoch": 2.63986013986014, "grad_norm": 4.322463512420654, "kl": 0.4382891058921814, "learning_rate": 8.598524275237321e-07, "loss": 0.0175, "reward": 2.537440776824951, "reward_std": 1.3807319402694702, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.26244068145751953, "rewards/reward_search_strategy": 0.15000000596046448, "step": 755 }, { "completion_length": 621.625, "epoch": 2.6433566433566433, "grad_norm": 1.4833965301513672, "kl": 0.3152078688144684, "learning_rate": 8.532763497032987e-07, "loss": 0.0126, "reward": 3.5618674755096436, "reward_std": 1.99989914894104, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.2868673503398895, "rewards/reward_search_strategy": 0.15000000596046448, "step": 756 }, { "completion_length": 198.875, "epoch": 2.6468531468531467, "grad_norm": 1.5622419118881226, "kl": 0.7662823796272278, "learning_rate": 8.467203366908708e-07, "loss": 0.0307, "reward": 3.2453174591064453, "reward_std": 0.4246501624584198, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.295317679643631, "rewards/reward_search_strategy": 0.20000001788139343, "step": 757 }, { "completion_length": 428.625, "epoch": 2.6503496503496504, "grad_norm": 1.6101847887039185, "kl": 0.3458474278450012, "learning_rate": 8.40184468369396e-07, "loss": 0.0138, "reward": 2.8005740642547607, "reward_std": 0.8033618927001953, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.2755739688873291, "rewards/reward_search_strategy": 0.15000000596046448, "step": 758 }, { "completion_length": 542.75, "epoch": 2.6538461538461537, "grad_norm": 3.1310296058654785, "kl": 0.7589699625968933, "learning_rate": 8.336688243763691e-07, "loss": 0.0304, "reward": 2.9324984550476074, "reward_std": 2.1960811614990234, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.2574983239173889, "rewards/reward_search_strategy": 0.17500001192092896, "step": 759 }, { "completion_length": 155.375, "epoch": 2.6573426573426575, "grad_norm": 2.1024746894836426, "kl": 0.46271345019340515, "learning_rate": 8.271734841028553e-07, "loss": 0.0185, "reward": 3.8447351455688477, "reward_std": 1.4537591934204102, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.09473513811826706, "rewards/reward_search_strategy": 0.25, "step": 760 }, { "completion_length": 200.25, "epoch": 2.660839160839161, "grad_norm": 2.331498384475708, "kl": 0.23558367788791656, "learning_rate": 8.206985266925249e-07, "loss": 0.0094, "reward": 2.9388587474823, "reward_std": 1.143480896949768, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5388587713241577, "rewards/reward_search_strategy": 0.15000000596046448, "step": 761 }, { "completion_length": 151.625, "epoch": 2.664335664335664, "grad_norm": 4.439377307891846, "kl": 0.4204064607620239, "learning_rate": 8.142440310406923e-07, "loss": 0.0168, "reward": 2.6536755561828613, "reward_std": 1.5174716711044312, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.25367552042007446, "rewards/reward_search_strategy": 0.15000000596046448, "step": 762 }, { "completion_length": 479.5, "epoch": 2.667832167832168, "grad_norm": 1.8708802461624146, "kl": 0.357788622379303, "learning_rate": 8.078100757933486e-07, "loss": 0.0143, "reward": 1.2356081008911133, "reward_std": 1.1825733184814453, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.08560801297426224, "rewards/reward_search_strategy": 0.15000000596046448, "step": 763 }, { "completion_length": 722.0, "epoch": 2.6713286713286712, "grad_norm": 0.5913345217704773, "kl": 0.27452099323272705, "learning_rate": 8.013967393462094e-07, "loss": 0.011, "reward": 2.1991820335388184, "reward_std": 1.0677379369735718, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4991818070411682, "rewards/reward_search_strategy": 0.20000000298023224, "step": 764 }, { "completion_length": 235.375, "epoch": 2.674825174825175, "grad_norm": 1.0853415727615356, "kl": 0.44766783714294434, "learning_rate": 7.950040998437541e-07, "loss": 0.0179, "reward": 2.8691787719726562, "reward_std": 0.728480339050293, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3441789150238037, "rewards/reward_search_strategy": 0.15000000596046448, "step": 765 }, { "completion_length": 775.25, "epoch": 2.6783216783216783, "grad_norm": 0.7826525568962097, "kl": 0.21313609182834625, "learning_rate": 7.886322351782782e-07, "loss": 0.0085, "reward": 2.260554790496826, "reward_std": 1.4236441850662231, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.41055476665496826, "rewards/reward_search_strategy": 0.10000000149011612, "step": 766 }, { "completion_length": 344.375, "epoch": 2.6818181818181817, "grad_norm": 0.8898032903671265, "kl": 0.28934159874916077, "learning_rate": 7.822812229889429e-07, "loss": 0.0116, "reward": 3.2559313774108887, "reward_std": 0.8968936204910278, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.280931293964386, "rewards/reward_search_strategy": 0.4750000238418579, "step": 767 }, { "completion_length": 823.25, "epoch": 2.6853146853146854, "grad_norm": 2.481016159057617, "kl": 0.4265955686569214, "learning_rate": 7.759511406608255e-07, "loss": 0.0171, "reward": 2.4612226486206055, "reward_std": 1.4923436641693115, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.43622246384620667, "rewards/reward_search_strategy": 0.15000000596046448, "step": 768 }, { "completion_length": 425.125, "epoch": 2.6888111888111887, "grad_norm": 1.47541081905365, "kl": 0.45022284984588623, "learning_rate": 7.696420653239834e-07, "loss": 0.018, "reward": 3.914667844772339, "reward_std": 2.1343414783477783, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4396677315235138, "rewards/reward_search_strategy": 0.22500000894069672, "step": 769 }, { "completion_length": 478.625, "epoch": 2.6923076923076925, "grad_norm": 0.788350522518158, "kl": 0.34727805852890015, "learning_rate": 7.633540738525066e-07, "loss": 0.0139, "reward": 3.1578547954559326, "reward_std": 0.9086012840270996, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.40785473585128784, "rewards/reward_search_strategy": 0.125, "step": 770 }, { "completion_length": 764.375, "epoch": 2.695804195804196, "grad_norm": 0.9074851274490356, "kl": 0.2522607147693634, "learning_rate": 7.57087242863589e-07, "loss": 0.0101, "reward": 2.462294578552246, "reward_std": 1.7639708518981934, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.28729456663131714, "rewards/reward_search_strategy": 0.17500001192092896, "step": 771 }, { "completion_length": 478.25, "epoch": 2.699300699300699, "grad_norm": 0.3205096423625946, "kl": 0.20822447538375854, "learning_rate": 7.508416487165862e-07, "loss": 0.0083, "reward": 2.753657817840576, "reward_std": 1.4829416275024414, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.0786578357219696, "rewards/reward_search_strategy": 0.17499999701976776, "step": 772 }, { "completion_length": 398.25, "epoch": 2.702797202797203, "grad_norm": 2.2809150218963623, "kl": 0.478731244802475, "learning_rate": 7.44617367512094e-07, "loss": 0.0191, "reward": 1.8494818210601807, "reward_std": 1.4902433156967163, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.1994817703962326, "rewards/reward_search_strategy": 0.15000000596046448, "step": 773 }, { "completion_length": 495.875, "epoch": 2.7062937062937062, "grad_norm": 0.8974342942237854, "kl": 0.2864784300327301, "learning_rate": 7.384144750910133e-07, "loss": 0.0115, "reward": 2.170168876647949, "reward_std": 0.5860690474510193, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.1701689064502716, "rewards/reward_search_strategy": 0.25, "step": 774 }, { "completion_length": 291.875, "epoch": 2.70979020979021, "grad_norm": 2.3842809200286865, "kl": 0.34134241938591003, "learning_rate": 7.322330470336314e-07, "loss": 0.0137, "reward": 2.422370433807373, "reward_std": 1.3260725736618042, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.2473704218864441, "rewards/reward_search_strategy": 0.17499999701976776, "step": 775 }, { "completion_length": 433.75, "epoch": 2.7132867132867133, "grad_norm": 1.3799805641174316, "kl": 0.3572145104408264, "learning_rate": 7.260731586586983e-07, "loss": 0.0143, "reward": 3.9122109413146973, "reward_std": 2.410112142562866, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3622109889984131, "rewards/reward_search_strategy": 0.17499999701976776, "step": 776 }, { "completion_length": 120.625, "epoch": 2.7167832167832167, "grad_norm": 2.863346576690674, "kl": 0.5980196595191956, "learning_rate": 7.199348850225091e-07, "loss": 0.0239, "reward": 2.5049784183502197, "reward_std": 2.3617804050445557, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.30497854948043823, "rewards/reward_search_strategy": 0.07500000298023224, "step": 777 }, { "completion_length": 285.375, "epoch": 2.7202797202797204, "grad_norm": 0.9683252573013306, "kl": 0.3256075978279114, "learning_rate": 7.138183009179922e-07, "loss": 0.013, "reward": 3.2380764484405518, "reward_std": 0.36426228284835815, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.13807635009288788, "rewards/reward_search_strategy": 0.22500000894069672, "step": 778 }, { "completion_length": 287.125, "epoch": 2.7237762237762237, "grad_norm": 1.8798741102218628, "kl": 0.39888471364974976, "learning_rate": 7.077234808737932e-07, "loss": 0.016, "reward": 3.775599956512451, "reward_std": 1.1065847873687744, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.25059974193573, "rewards/reward_search_strategy": 0.2750000059604645, "step": 779 }, { "completion_length": 238.125, "epoch": 2.7272727272727275, "grad_norm": 0.638646125793457, "kl": 0.35831624269485474, "learning_rate": 7.016504991533727e-07, "loss": 0.0143, "reward": 3.084535837173462, "reward_std": 0.37157776951789856, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.03453579917550087, "rewards/reward_search_strategy": 0.17499999701976776, "step": 780 }, { "completion_length": 209.5, "epoch": 2.730769230769231, "grad_norm": 1.6357184648513794, "kl": 0.4222680926322937, "learning_rate": 6.955994297540947e-07, "loss": 0.0169, "reward": 3.300478458404541, "reward_std": 0.9898151755332947, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.47547852993011475, "rewards/reward_search_strategy": 0.20000000298023224, "step": 781 }, { "completion_length": 461.125, "epoch": 2.734265734265734, "grad_norm": 1.0644277334213257, "kl": 0.5362391471862793, "learning_rate": 6.895703464063319e-07, "loss": 0.0214, "reward": 1.9209330081939697, "reward_std": 1.2699564695358276, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3959329128265381, "rewards/reward_search_strategy": 0.15000000596046448, "step": 782 }, { "completion_length": 314.125, "epoch": 2.737762237762238, "grad_norm": 1.257867455482483, "kl": 0.33695724606513977, "learning_rate": 6.835633225725604e-07, "loss": 0.0135, "reward": 2.7970452308654785, "reward_std": 0.8000296354293823, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.34704503417015076, "rewards/reward_search_strategy": 0.20000000298023224, "step": 783 }, { "completion_length": 200.0, "epoch": 2.7412587412587412, "grad_norm": 3.777306079864502, "kl": 0.3662494719028473, "learning_rate": 6.775784314464717e-07, "loss": 0.0146, "reward": 3.5017638206481934, "reward_std": 0.633737325668335, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.47676384449005127, "rewards/reward_search_strategy": 0.15000000596046448, "step": 784 }, { "completion_length": 261.75, "epoch": 2.744755244755245, "grad_norm": 2.3273465633392334, "kl": 1.1071792840957642, "learning_rate": 6.716157459520739e-07, "loss": 0.0443, "reward": 1.6476645469665527, "reward_std": 1.7869136333465576, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.24766448140144348, "rewards/reward_search_strategy": 0.15000000596046448, "step": 785 }, { "completion_length": 233.5, "epoch": 2.7482517482517483, "grad_norm": 4.439958095550537, "kl": 1.2087188959121704, "learning_rate": 6.656753387428089e-07, "loss": 0.0483, "reward": 2.8876936435699463, "reward_std": 1.3056769371032715, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.18769364058971405, "rewards/reward_search_strategy": 0.20000001788139343, "step": 786 }, { "completion_length": 148.5, "epoch": 2.7517482517482517, "grad_norm": 3.226747751235962, "kl": 1.4384397268295288, "learning_rate": 6.597572822006643e-07, "loss": 0.0575, "reward": 3.148876190185547, "reward_std": 2.1138596534729004, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.12387635558843613, "rewards/reward_search_strategy": 0.15000000596046448, "step": 787 }, { "completion_length": 529.125, "epoch": 2.755244755244755, "grad_norm": 0.5552653670310974, "kl": 0.23504666984081268, "learning_rate": 6.538616484352902e-07, "loss": 0.0094, "reward": 2.5502982139587402, "reward_std": 1.2091935873031616, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5502983331680298, "rewards/reward_search_strategy": 0.125, "step": 788 }, { "completion_length": 166.5, "epoch": 2.7587412587412588, "grad_norm": 3.255779504776001, "kl": 0.8359960317611694, "learning_rate": 6.479885092831251e-07, "loss": 0.0334, "reward": 2.719755172729492, "reward_std": 1.7059824466705322, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.44475507736206055, "rewards/reward_search_strategy": 0.15000000596046448, "step": 789 }, { "completion_length": 416.75, "epoch": 2.762237762237762, "grad_norm": 2.2188446521759033, "kl": 0.3388492465019226, "learning_rate": 6.421379363065142e-07, "loss": 0.0136, "reward": 3.058757781982422, "reward_std": 0.9530322551727295, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.33375751972198486, "rewards/reward_search_strategy": 0.22500000894069672, "step": 790 }, { "completion_length": 469.5, "epoch": 2.765734265734266, "grad_norm": 0.4612925946712494, "kl": 0.3347489833831787, "learning_rate": 6.363100007928447e-07, "loss": 0.0134, "reward": 2.95046329498291, "reward_std": 1.2049694061279297, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.12546327710151672, "rewards/reward_search_strategy": 0.20000001788139343, "step": 791 }, { "completion_length": 596.125, "epoch": 2.769230769230769, "grad_norm": 3.1505653858184814, "kl": 1.0356903076171875, "learning_rate": 6.305047737536707e-07, "loss": 0.0414, "reward": 1.7035177946090698, "reward_std": 1.193629503250122, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.15351778268814087, "rewards/reward_search_strategy": 0.05000000074505806, "step": 792 }, { "completion_length": 215.375, "epoch": 2.7727272727272725, "grad_norm": 3.159670114517212, "kl": 0.5138453841209412, "learning_rate": 6.247223259238511e-07, "loss": 0.0206, "reward": 3.344359874725342, "reward_std": 1.897449016571045, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.14435996115207672, "rewards/reward_search_strategy": 0.20000000298023224, "step": 793 }, { "completion_length": 420.25, "epoch": 2.7762237762237763, "grad_norm": 3.727031946182251, "kl": 0.41760408878326416, "learning_rate": 6.189627277606894e-07, "loss": 0.0167, "reward": 1.8353703022003174, "reward_std": 1.3640153408050537, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.31037023663520813, "rewards/reward_search_strategy": 0.15000000596046448, "step": 794 }, { "completion_length": 791.75, "epoch": 2.7797202797202796, "grad_norm": 0.6065571308135986, "kl": 0.25871652364730835, "learning_rate": 6.1322604944307e-07, "loss": 0.0103, "reward": 2.9146928787231445, "reward_std": 1.0075814723968506, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.3896929621696472, "rewards/reward_search_strategy": 0.2750000059604645, "step": 795 }, { "completion_length": 124.375, "epoch": 2.7832167832167833, "grad_norm": 2.5715625286102295, "kl": 1.1604820489883423, "learning_rate": 6.075123608706093e-07, "loss": 0.0464, "reward": 3.2097158432006836, "reward_std": 2.8865416049957275, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.33471575379371643, "rewards/reward_search_strategy": 0.125, "step": 796 }, { "completion_length": 395.875, "epoch": 2.7867132867132867, "grad_norm": 0.8253587484359741, "kl": 0.43984436988830566, "learning_rate": 6.01821731662798e-07, "loss": 0.0176, "reward": 2.630898952484131, "reward_std": 1.3247750997543335, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.25589874386787415, "rewards/reward_search_strategy": 0.125, "step": 797 }, { "completion_length": 248.5, "epoch": 2.79020979020979, "grad_norm": 1.1423490047454834, "kl": 0.2707673907279968, "learning_rate": 5.961542311581586e-07, "loss": 0.0108, "reward": 2.9681153297424316, "reward_std": 0.8161411285400391, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.3431151509284973, "rewards/reward_search_strategy": 0.125, "step": 798 }, { "completion_length": 336.25, "epoch": 2.7937062937062938, "grad_norm": 2.9380545616149902, "kl": 0.2909378409385681, "learning_rate": 5.905099284133953e-07, "loss": 0.0116, "reward": 2.340406894683838, "reward_std": 1.1120237112045288, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4404069185256958, "rewards/reward_search_strategy": 0.15000000596046448, "step": 799 }, { "completion_length": 195.375, "epoch": 2.797202797202797, "grad_norm": 6.42050313949585, "kl": 0.44025787711143494, "learning_rate": 5.848888922025553e-07, "loss": 0.0176, "reward": 1.9848580360412598, "reward_std": 1.1959803104400635, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.1848580241203308, "rewards/reward_search_strategy": 0.17500001192092896, "step": 800 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }