{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6993006993006993, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 145.875, "epoch": 0.0034965034965034965, "grad_norm": 8.602514266967773, "kl": 0.0, "learning_rate": 5.0000000000000004e-08, "loss": 0.0, "reward": 0.7051772475242615, "reward_std": 0.8263505697250366, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.3301772475242615, "rewards/reward_search_strategy": 0.0, "step": 1 }, { "completion_length": 88.125, "epoch": 0.006993006993006993, "grad_norm": 1.4985997676849365, "kl": 0.0, "learning_rate": 1.0000000000000001e-07, "loss": -0.0, "reward": 1.8388628959655762, "reward_std": 2.5329272747039795, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.2388630509376526, "rewards/reward_search_strategy": 0.10000000149011612, "step": 2 }, { "completion_length": 296.5, "epoch": 0.01048951048951049, "grad_norm": 0.7674520611763, "kl": 0.0006381775019690394, "learning_rate": 1.5000000000000002e-07, "loss": 0.0, "reward": 1.654070258140564, "reward_std": 2.827007532119751, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.17907029390335083, "rewards/reward_search_strategy": 0.22499999403953552, "step": 3 }, { "completion_length": 290.125, "epoch": 0.013986013986013986, "grad_norm": 1.0457236766815186, "kl": 0.0007995082996785641, "learning_rate": 2.0000000000000002e-07, "loss": 0.0, "reward": 3.421433448791504, "reward_std": 2.9866631031036377, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.396433562040329, "rewards/reward_search_strategy": 0.2750000059604645, "step": 4 }, { "completion_length": 259.125, "epoch": 0.017482517482517484, "grad_norm": 1.3121755123138428, "kl": 0.000728036102373153, "learning_rate": 2.5000000000000004e-07, "loss": 0.0, "reward": 0.9268761873245239, "reward_std": 0.7560767531394958, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.2268761545419693, "rewards/reward_search_strategy": 0.07500000298023224, "step": 5 }, { "completion_length": 383.5, "epoch": 0.02097902097902098, "grad_norm": 0.9658453464508057, "kl": 0.0007470359560102224, "learning_rate": 3.0000000000000004e-07, "loss": 0.0, "reward": 1.4594597816467285, "reward_std": 2.2520837783813477, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.25945988297462463, "rewards/reward_search_strategy": 0.07500000298023224, "step": 6 }, { "completion_length": 144.375, "epoch": 0.024475524475524476, "grad_norm": 1.1642646789550781, "kl": 0.0009361266857013106, "learning_rate": 3.5000000000000004e-07, "loss": 0.0, "reward": 1.378964900970459, "reward_std": 2.0963218212127686, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.30396491289138794, "rewards/reward_search_strategy": 0.07500000298023224, "step": 7 }, { "completion_length": 314.375, "epoch": 0.027972027972027972, "grad_norm": 5.21930456161499, "kl": 0.0008346753311343491, "learning_rate": 4.0000000000000003e-07, "loss": 0.0, "reward": 1.181166410446167, "reward_std": 1.6016024351119995, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.3561664819717407, "rewards/reward_search_strategy": 0.07500000298023224, "step": 8 }, { "completion_length": 98.375, "epoch": 0.03146853146853147, "grad_norm": 1.6168773174285889, "kl": 0.0009058149298653007, "learning_rate": 4.5000000000000003e-07, "loss": 0.0, "reward": 1.141206979751587, "reward_std": 2.1089890003204346, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.16620691120624542, "rewards/reward_search_strategy": 0.10000000149011612, "step": 9 }, { "completion_length": 269.625, "epoch": 0.03496503496503497, "grad_norm": 1.310072660446167, "kl": 0.000750248203985393, "learning_rate": 5.000000000000001e-07, "loss": 0.0, "reward": 1.901651382446289, "reward_std": 3.08128023147583, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.27665144205093384, "rewards/reward_search_strategy": 0.125, "step": 10 }, { "completion_length": 311.125, "epoch": 0.038461538461538464, "grad_norm": 1.1051065921783447, "kl": 0.000834438658785075, "learning_rate": 5.5e-07, "loss": 0.0, "reward": 2.1270594596862793, "reward_std": 2.309929132461548, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.377059668302536, "rewards/reward_search_strategy": 0.125, "step": 11 }, { "completion_length": 182.375, "epoch": 0.04195804195804196, "grad_norm": 1.3039528131484985, "kl": 0.0007587745785713196, "learning_rate": 6.000000000000001e-07, "loss": 0.0, "reward": 0.24762853980064392, "reward_std": 0.28513866662979126, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.17262855172157288, "rewards/reward_search_strategy": 0.07500000298023224, "step": 12 }, { "completion_length": 198.5, "epoch": 0.045454545454545456, "grad_norm": 1.176461935043335, "kl": 0.0008510244661010802, "learning_rate": 6.5e-07, "loss": 0.0, "reward": 2.0060770511627197, "reward_std": 2.2957346439361572, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.3060770034790039, "rewards/reward_search_strategy": 0.07500000298023224, "step": 13 }, { "completion_length": 158.25, "epoch": 0.04895104895104895, "grad_norm": 1.1484898328781128, "kl": 0.0009028888889588416, "learning_rate": 7.000000000000001e-07, "loss": 0.0, "reward": 0.2598831355571747, "reward_std": 0.4156216084957123, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.1348831057548523, "rewards/reward_search_strategy": 0.0, "step": 14 }, { "completion_length": 321.125, "epoch": 0.05244755244755245, "grad_norm": 1.08210027217865, "kl": 0.0007499511120840907, "learning_rate": 7.5e-07, "loss": 0.0, "reward": 2.711674213409424, "reward_std": 3.2537524700164795, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.3866744041442871, "rewards/reward_search_strategy": 0.20000000298023224, "step": 15 }, { "completion_length": 176.0, "epoch": 0.055944055944055944, "grad_norm": 1.007460117340088, "kl": 0.0008281145128421485, "learning_rate": 8.000000000000001e-07, "loss": 0.0, "reward": 1.6642776727676392, "reward_std": 3.1836605072021484, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.1892777681350708, "rewards/reward_search_strategy": 0.10000000149011612, "step": 16 }, { "completion_length": 299.0, "epoch": 0.05944055944055944, "grad_norm": 0.9308914542198181, "kl": 0.0006533896084874868, "learning_rate": 8.500000000000001e-07, "loss": 0.0, "reward": 2.9600870609283447, "reward_std": 3.002318859100342, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.38508710265159607, "rewards/reward_search_strategy": 0.07500000298023224, "step": 17 }, { "completion_length": 170.125, "epoch": 0.06293706293706294, "grad_norm": 1.6546720266342163, "kl": 0.0009436010150238872, "learning_rate": 9.000000000000001e-07, "loss": 0.0, "reward": 1.807711124420166, "reward_std": 2.873779773712158, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.33271118998527527, "rewards/reward_search_strategy": 0.22499999403953552, "step": 18 }, { "completion_length": 377.125, "epoch": 0.06643356643356643, "grad_norm": 0.7977221608161926, "kl": 0.0008231342071667314, "learning_rate": 9.500000000000001e-07, "loss": 0.0, "reward": 0.7038719058036804, "reward_std": 1.2563936710357666, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.22887186706066132, "rewards/reward_search_strategy": 0.10000000149011612, "step": 19 }, { "completion_length": 161.625, "epoch": 0.06993006993006994, "grad_norm": 1.5974496603012085, "kl": 0.000790413178037852, "learning_rate": 1.0000000000000002e-06, "loss": 0.0, "reward": 2.404862880706787, "reward_std": 2.712550163269043, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.179862841963768, "rewards/reward_search_strategy": 0.10000000149011612, "step": 20 }, { "completion_length": 298.875, "epoch": 0.07342657342657342, "grad_norm": 0.8817616105079651, "kl": 0.0007674504304304719, "learning_rate": 1.0500000000000001e-06, "loss": 0.0, "reward": 2.6999998092651367, "reward_std": 3.122270345687866, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.05000000074505806, "rewards/reward_search_strategy": 0.2750000059604645, "step": 21 }, { "completion_length": 150.125, "epoch": 0.07692307692307693, "grad_norm": 1.345902919769287, "kl": 0.0010101046646013856, "learning_rate": 1.1e-06, "loss": 0.0, "reward": 1.2625212669372559, "reward_std": 1.9802589416503906, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.13752126693725586, "rewards/reward_search_strategy": 0.0, "step": 22 }, { "completion_length": 156.625, "epoch": 0.08041958041958042, "grad_norm": 3.1841650009155273, "kl": 0.0009282200480811298, "learning_rate": 1.1500000000000002e-06, "loss": 0.0, "reward": 2.532832145690918, "reward_std": 3.623222827911377, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.18283231556415558, "rewards/reward_search_strategy": 0.22500000894069672, "step": 23 }, { "completion_length": 230.625, "epoch": 0.08391608391608392, "grad_norm": 0.560590922832489, "kl": 0.0008200581069104373, "learning_rate": 1.2000000000000002e-06, "loss": 0.0, "reward": 0.7196725606918335, "reward_std": 1.5131947994232178, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.1946725845336914, "rewards/reward_search_strategy": 0.02500000037252903, "step": 24 }, { "completion_length": 232.5, "epoch": 0.08741258741258741, "grad_norm": 5.627699375152588, "kl": 0.0007359281880781054, "learning_rate": 1.25e-06, "loss": 0.0, "reward": 2.3161444664001465, "reward_std": 2.266465425491333, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.31614452600479126, "rewards/reward_search_strategy": 0.0, "step": 25 }, { "completion_length": 256.625, "epoch": 0.09090909090909091, "grad_norm": 1.243622064590454, "kl": 0.0009776452789083123, "learning_rate": 1.3e-06, "loss": 0.0, "reward": 1.8542793989181519, "reward_std": 2.3880927562713623, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.22927939891815186, "rewards/reward_search_strategy": 0.0, "step": 26 }, { "completion_length": 221.625, "epoch": 0.0944055944055944, "grad_norm": 2.455430030822754, "kl": 0.0014260835014283657, "learning_rate": 1.3500000000000002e-06, "loss": 0.0001, "reward": 1.0012282133102417, "reward_std": 1.7000349760055542, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.15122826397418976, "rewards/reward_search_strategy": 0.10000000149011612, "step": 27 }, { "completion_length": 185.75, "epoch": 0.0979020979020979, "grad_norm": 1.199497938156128, "kl": 0.0007826727814972401, "learning_rate": 1.4000000000000001e-06, "loss": 0.0, "reward": 2.173332691192627, "reward_std": 2.801347255706787, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.32333269715309143, "rewards/reward_search_strategy": 0.10000000149011612, "step": 28 }, { "completion_length": 243.125, "epoch": 0.10139860139860139, "grad_norm": 0.8430306315422058, "kl": 0.0008962135761976242, "learning_rate": 1.45e-06, "loss": 0.0, "reward": 1.0625323057174683, "reward_std": 1.898645043373108, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.18753226101398468, "rewards/reward_search_strategy": 0.0, "step": 29 }, { "completion_length": 334.875, "epoch": 0.1048951048951049, "grad_norm": 0.8412113785743713, "kl": 0.001170428702607751, "learning_rate": 1.5e-06, "loss": 0.0, "reward": 1.303612232208252, "reward_std": 2.1826775074005127, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.27861225605010986, "rewards/reward_search_strategy": 0.02500000037252903, "step": 30 }, { "completion_length": 165.75, "epoch": 0.10839160839160839, "grad_norm": 1.9105679988861084, "kl": 0.0031302073039114475, "learning_rate": 1.5500000000000002e-06, "loss": 0.0001, "reward": 1.0198408365249634, "reward_std": 2.1316380500793457, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.16984085738658905, "rewards/reward_search_strategy": 0.10000000149011612, "step": 31 }, { "completion_length": 174.625, "epoch": 0.11188811188811189, "grad_norm": 2.309382915496826, "kl": 0.0032356895972043276, "learning_rate": 1.6000000000000001e-06, "loss": 0.0001, "reward": 0.9216341376304626, "reward_std": 1.8671420812606812, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.14663413166999817, "rewards/reward_search_strategy": 0.02500000037252903, "step": 32 }, { "completion_length": 85.75, "epoch": 0.11538461538461539, "grad_norm": 1.4122040271759033, "kl": 0.0017049856251105666, "learning_rate": 1.6500000000000003e-06, "loss": 0.0001, "reward": 1.524999976158142, "reward_std": 2.2926902770996094, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.10000000149011612, "rewards/reward_search_strategy": 0.05000000074505806, "step": 33 }, { "completion_length": 173.5, "epoch": 0.11888111888111888, "grad_norm": 0.9712541699409485, "kl": 0.0014842856908217072, "learning_rate": 1.7000000000000002e-06, "loss": 0.0001, "reward": 3.4844837188720703, "reward_std": 3.1145381927490234, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.3094840347766876, "rewards/reward_search_strategy": 0.17499999701976776, "step": 34 }, { "completion_length": 268.25, "epoch": 0.12237762237762238, "grad_norm": 1.1684075593948364, "kl": 0.0012478481512516737, "learning_rate": 1.75e-06, "loss": 0.0, "reward": 2.8716678619384766, "reward_std": 2.387646198272705, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.3716679811477661, "rewards/reward_search_strategy": 0.125, "step": 35 }, { "completion_length": 235.125, "epoch": 0.1258741258741259, "grad_norm": 1.2749513387680054, "kl": 0.004472827073186636, "learning_rate": 1.8000000000000001e-06, "loss": 0.0002, "reward": 1.8775699138641357, "reward_std": 2.413510322570801, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.30256983637809753, "rewards/reward_search_strategy": 0.07500000298023224, "step": 36 }, { "completion_length": 175.5, "epoch": 0.12937062937062938, "grad_norm": 0.8929537534713745, "kl": 0.0037049425300210714, "learning_rate": 1.85e-06, "loss": 0.0001, "reward": 0.6821428537368774, "reward_std": 1.4377206563949585, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.18214285373687744, "rewards/reward_search_strategy": 0.0, "step": 37 }, { "completion_length": 290.125, "epoch": 0.13286713286713286, "grad_norm": 2.52665114402771, "kl": 0.005157872103154659, "learning_rate": 1.9000000000000002e-06, "loss": 0.0002, "reward": 2.574741840362549, "reward_std": 2.5146002769470215, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.14974190294742584, "rewards/reward_search_strategy": 0.17500001192092896, "step": 38 }, { "completion_length": 344.125, "epoch": 0.13636363636363635, "grad_norm": 0.8259175419807434, "kl": 0.002582128159701824, "learning_rate": 1.9500000000000004e-06, "loss": 0.0001, "reward": 2.376612901687622, "reward_std": 2.9910624027252197, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.4266127943992615, "rewards/reward_search_strategy": 0.07500000298023224, "step": 39 }, { "completion_length": 243.125, "epoch": 0.13986013986013987, "grad_norm": 1.0021271705627441, "kl": 0.004831339232623577, "learning_rate": 2.0000000000000003e-06, "loss": 0.0002, "reward": 2.04349946975708, "reward_std": 2.3903656005859375, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.34349945187568665, "rewards/reward_search_strategy": 0.07500000298023224, "step": 40 }, { "completion_length": 207.375, "epoch": 0.14335664335664336, "grad_norm": 1.2446260452270508, "kl": 0.005710378754884005, "learning_rate": 2.05e-06, "loss": 0.0002, "reward": 2.8324475288391113, "reward_std": 3.167931079864502, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.25744765996932983, "rewards/reward_search_strategy": 0.20000000298023224, "step": 41 }, { "completion_length": 441.125, "epoch": 0.14685314685314685, "grad_norm": 0.8263920545578003, "kl": 0.0030913222581148148, "learning_rate": 2.1000000000000002e-06, "loss": 0.0001, "reward": 2.5794529914855957, "reward_std": 2.6344144344329834, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.30445292592048645, "rewards/reward_search_strategy": 0.15000000596046448, "step": 42 }, { "completion_length": 226.375, "epoch": 0.15034965034965034, "grad_norm": 1.141964316368103, "kl": 0.008909309282898903, "learning_rate": 2.15e-06, "loss": 0.0004, "reward": 1.8603672981262207, "reward_std": 2.203197479248047, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.21036729216575623, "rewards/reward_search_strategy": 0.02500000037252903, "step": 43 }, { "completion_length": 397.75, "epoch": 0.15384615384615385, "grad_norm": 0.9564006328582764, "kl": 0.01092858798801899, "learning_rate": 2.2e-06, "loss": 0.0004, "reward": 2.557056188583374, "reward_std": 2.429002046585083, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.28205621242523193, "rewards/reward_search_strategy": 0.02500000037252903, "step": 44 }, { "completion_length": 307.25, "epoch": 0.15734265734265734, "grad_norm": 1.0510841608047485, "kl": 0.007918575778603554, "learning_rate": 2.25e-06, "loss": 0.0003, "reward": 1.2331278324127197, "reward_std": 1.7548075914382935, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.15812794864177704, "rewards/reward_search_strategy": 0.07500000298023224, "step": 45 }, { "completion_length": 277.0, "epoch": 0.16083916083916083, "grad_norm": 0.9169591665267944, "kl": 0.015062836930155754, "learning_rate": 2.3000000000000004e-06, "loss": 0.0006, "reward": 1.2089338302612305, "reward_std": 2.5115318298339844, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.1339338719844818, "rewards/reward_search_strategy": 0.20000000298023224, "step": 46 }, { "completion_length": 206.25, "epoch": 0.16433566433566432, "grad_norm": 1.3427180051803589, "kl": 0.019624339416623116, "learning_rate": 2.35e-06, "loss": 0.0008, "reward": 4.1133527755737305, "reward_std": 3.2583820819854736, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.36335283517837524, "rewards/reward_search_strategy": 0.125, "step": 47 }, { "completion_length": 252.125, "epoch": 0.16783216783216784, "grad_norm": 0.7444542050361633, "kl": 0.021463895216584206, "learning_rate": 2.4000000000000003e-06, "loss": 0.0009, "reward": 1.631263017654419, "reward_std": 2.5925517082214355, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.23126289248466492, "rewards/reward_search_strategy": 0.02500000037252903, "step": 48 }, { "completion_length": 117.625, "epoch": 0.17132867132867133, "grad_norm": 1.888673186302185, "kl": 0.03609447553753853, "learning_rate": 2.4500000000000003e-06, "loss": 0.0014, "reward": 1.9036250114440918, "reward_std": 2.2273151874542236, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.20362500846385956, "rewards/reward_search_strategy": 0.07500000298023224, "step": 49 }, { "completion_length": 302.75, "epoch": 0.17482517482517482, "grad_norm": 1.2723031044006348, "kl": 0.2012804001569748, "learning_rate": 2.5e-06, "loss": 0.0081, "reward": 0.15956448018550873, "reward_std": 0.4513165056705475, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.03456447646021843, "rewards/reward_search_strategy": 0.0, "step": 50 }, { "completion_length": 222.0, "epoch": 0.17832167832167833, "grad_norm": 0.8149501085281372, "kl": 0.04784020781517029, "learning_rate": 2.55e-06, "loss": 0.0019, "reward": 0.49046826362609863, "reward_std": 0.9328532814979553, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.1654682457447052, "rewards/reward_search_strategy": 0.07500000298023224, "step": 51 }, { "completion_length": 329.75, "epoch": 0.18181818181818182, "grad_norm": 1.0090314149856567, "kl": 0.03431350365281105, "learning_rate": 2.6e-06, "loss": 0.0014, "reward": 1.8717395067214966, "reward_std": 2.0185225009918213, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.246739462018013, "rewards/reward_search_strategy": 0.25, "step": 52 }, { "completion_length": 292.5, "epoch": 0.1853146853146853, "grad_norm": 0.9738456606864929, "kl": 0.03869509696960449, "learning_rate": 2.6500000000000005e-06, "loss": 0.0015, "reward": 3.411203384399414, "reward_std": 2.776095390319824, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.28620344400405884, "rewards/reward_search_strategy": 0.0, "step": 53 }, { "completion_length": 262.25, "epoch": 0.1888111888111888, "grad_norm": 1.0264172554016113, "kl": 0.04221673682332039, "learning_rate": 2.7000000000000004e-06, "loss": 0.0017, "reward": 3.2230210304260254, "reward_std": 3.179140329360962, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.2730211913585663, "rewards/reward_search_strategy": 0.32500001788139343, "step": 54 }, { "completion_length": 278.0, "epoch": 0.19230769230769232, "grad_norm": 1.219490885734558, "kl": 0.07308313995599747, "learning_rate": 2.7500000000000004e-06, "loss": 0.0029, "reward": 2.9714736938476562, "reward_std": 3.078878164291382, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.32147371768951416, "rewards/reward_search_strategy": 0.2750000059604645, "step": 55 }, { "completion_length": 198.125, "epoch": 0.1958041958041958, "grad_norm": 2.261676788330078, "kl": 0.10734312981367111, "learning_rate": 2.8000000000000003e-06, "loss": 0.0043, "reward": 1.751394271850586, "reward_std": 2.916086196899414, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.35139432549476624, "rewards/reward_search_strategy": 0.15000000596046448, "step": 56 }, { "completion_length": 368.25, "epoch": 0.1993006993006993, "grad_norm": 1.0139119625091553, "kl": 0.07945723086595535, "learning_rate": 2.85e-06, "loss": 0.0032, "reward": 2.6666977405548096, "reward_std": 3.0744428634643555, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.1416977047920227, "rewards/reward_search_strategy": 0.15000000596046448, "step": 57 }, { "completion_length": 445.875, "epoch": 0.20279720279720279, "grad_norm": 1.2591164112091064, "kl": 0.0817643404006958, "learning_rate": 2.9e-06, "loss": 0.0033, "reward": 3.105250358581543, "reward_std": 2.503899574279785, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.4302505850791931, "rewards/reward_search_strategy": 0.17499999701976776, "step": 58 }, { "completion_length": 465.875, "epoch": 0.2062937062937063, "grad_norm": 0.6122844219207764, "kl": 0.0703798234462738, "learning_rate": 2.95e-06, "loss": 0.0028, "reward": 1.6243177652359009, "reward_std": 1.8378466367721558, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.37431782484054565, "rewards/reward_search_strategy": 0.125, "step": 59 }, { "completion_length": 256.375, "epoch": 0.2097902097902098, "grad_norm": 0.9197457432746887, "kl": 0.08273329585790634, "learning_rate": 3e-06, "loss": 0.0033, "reward": 2.420247793197632, "reward_std": 2.3122122287750244, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.4202479422092438, "rewards/reward_search_strategy": 0.25, "step": 60 }, { "completion_length": 517.125, "epoch": 0.21328671328671328, "grad_norm": 0.8799907565116882, "kl": 0.20334148406982422, "learning_rate": 3.05e-06, "loss": 0.0081, "reward": 1.377384066581726, "reward_std": 1.68935227394104, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.3023841083049774, "rewards/reward_search_strategy": 0.07500000298023224, "step": 61 }, { "completion_length": 502.5, "epoch": 0.21678321678321677, "grad_norm": 0.7109338641166687, "kl": 0.03114498406648636, "learning_rate": 3.1000000000000004e-06, "loss": 0.0012, "reward": 5.466271877288818, "reward_std": 2.7285211086273193, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.6912716627120972, "rewards/reward_search_strategy": 0.4000000059604645, "step": 62 }, { "completion_length": 503.25, "epoch": 0.2202797202797203, "grad_norm": 0.9971331357955933, "kl": 0.20209850370883942, "learning_rate": 3.1500000000000003e-06, "loss": 0.0081, "reward": 1.2392462491989136, "reward_std": 1.4006211757659912, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.189246267080307, "rewards/reward_search_strategy": 0.17500001192092896, "step": 63 }, { "completion_length": 288.5, "epoch": 0.22377622377622378, "grad_norm": 0.8857221603393555, "kl": 0.10349184274673462, "learning_rate": 3.2000000000000003e-06, "loss": 0.0041, "reward": 1.3573650121688843, "reward_std": 1.9148555994033813, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.40736502408981323, "rewards/reward_search_strategy": 0.07500000298023224, "step": 64 }, { "completion_length": 445.875, "epoch": 0.22727272727272727, "grad_norm": 2.050611734390259, "kl": 0.9011820554733276, "learning_rate": 3.2500000000000002e-06, "loss": 0.036, "reward": 2.7885334491729736, "reward_std": 2.6393096446990967, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.46353352069854736, "rewards/reward_search_strategy": 0.20000000298023224, "step": 65 }, { "completion_length": 321.75, "epoch": 0.23076923076923078, "grad_norm": 0.8712911605834961, "kl": 0.11524824798107147, "learning_rate": 3.3000000000000006e-06, "loss": 0.0046, "reward": 2.963777542114258, "reward_std": 2.7310569286346436, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.21377724409103394, "rewards/reward_search_strategy": 0.125, "step": 66 }, { "completion_length": 579.5, "epoch": 0.23426573426573427, "grad_norm": 0.6312026977539062, "kl": 0.08477935940027237, "learning_rate": 3.3500000000000005e-06, "loss": 0.0034, "reward": 4.128048896789551, "reward_std": 2.959887742996216, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.30304890871047974, "rewards/reward_search_strategy": 0.20000000298023224, "step": 67 }, { "completion_length": 256.625, "epoch": 0.23776223776223776, "grad_norm": 0.835229218006134, "kl": 0.1031595766544342, "learning_rate": 3.4000000000000005e-06, "loss": 0.0041, "reward": 5.231247425079346, "reward_std": 2.7173686027526855, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.48124733567237854, "rewards/reward_search_strategy": 0.25, "step": 68 }, { "completion_length": 458.125, "epoch": 0.24125874125874125, "grad_norm": 1.0669630765914917, "kl": 0.050319548696279526, "learning_rate": 3.45e-06, "loss": 0.002, "reward": 2.627213716506958, "reward_std": 2.752366781234741, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.1022137925028801, "rewards/reward_search_strategy": 0.4000000059604645, "step": 69 }, { "completion_length": 758.125, "epoch": 0.24475524475524477, "grad_norm": 0.4896698594093323, "kl": 0.03787853941321373, "learning_rate": 3.5e-06, "loss": 0.0015, "reward": 3.715177059173584, "reward_std": 2.108518362045288, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.44017714262008667, "rewards/reward_search_strategy": 0.3999999761581421, "step": 70 }, { "completion_length": 359.625, "epoch": 0.24825174825174826, "grad_norm": 1.1720679998397827, "kl": 0.09393578767776489, "learning_rate": 3.5500000000000003e-06, "loss": 0.0038, "reward": 5.1651530265808105, "reward_std": 2.7064919471740723, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.39015328884124756, "rewards/reward_search_strategy": 0.5249999761581421, "step": 71 }, { "completion_length": 603.75, "epoch": 0.2517482517482518, "grad_norm": 0.92381352186203, "kl": 0.03434322774410248, "learning_rate": 3.6000000000000003e-06, "loss": 0.0014, "reward": 1.6090435981750488, "reward_std": 1.3871439695358276, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.5090435743331909, "rewards/reward_search_strategy": 0.22500000894069672, "step": 72 }, { "completion_length": 324.125, "epoch": 0.25524475524475526, "grad_norm": 0.8183465003967285, "kl": 0.09460737556219101, "learning_rate": 3.65e-06, "loss": 0.0038, "reward": 1.5781946182250977, "reward_std": 2.772611618041992, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.25319457054138184, "rewards/reward_search_strategy": 0.07500000298023224, "step": 73 }, { "completion_length": 479.5, "epoch": 0.25874125874125875, "grad_norm": 0.8604242205619812, "kl": 0.053619399666786194, "learning_rate": 3.7e-06, "loss": 0.0021, "reward": 4.117947578430176, "reward_std": 2.7907135486602783, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.41794782876968384, "rewards/reward_search_strategy": 0.20000000298023224, "step": 74 }, { "completion_length": 450.25, "epoch": 0.26223776223776224, "grad_norm": 2.1643834114074707, "kl": 0.19324736297130585, "learning_rate": 3.7500000000000005e-06, "loss": 0.0077, "reward": 4.815071105957031, "reward_std": 3.0326616764068604, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.39007121324539185, "rewards/reward_search_strategy": 0.42500001192092896, "step": 75 }, { "completion_length": 412.25, "epoch": 0.26573426573426573, "grad_norm": 0.8062530159950256, "kl": 0.0793648362159729, "learning_rate": 3.8000000000000005e-06, "loss": 0.0032, "reward": 3.5049312114715576, "reward_std": 2.922109603881836, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.5549312829971313, "rewards/reward_search_strategy": 0.20000000298023224, "step": 76 }, { "completion_length": 369.75, "epoch": 0.2692307692307692, "grad_norm": 1.467880368232727, "kl": 0.0835878923535347, "learning_rate": 3.85e-06, "loss": 0.0033, "reward": 2.802757978439331, "reward_std": 2.5307087898254395, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.3277580440044403, "rewards/reward_search_strategy": 0.22500000894069672, "step": 77 }, { "completion_length": 382.375, "epoch": 0.2727272727272727, "grad_norm": 4.771082401275635, "kl": 0.9713490605354309, "learning_rate": 3.900000000000001e-06, "loss": 0.0389, "reward": 2.052755117416382, "reward_std": 3.0059754848480225, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.3777550160884857, "rewards/reward_search_strategy": 0.30000001192092896, "step": 78 }, { "completion_length": 546.125, "epoch": 0.2762237762237762, "grad_norm": 0.48559486865997314, "kl": 0.037933606654405594, "learning_rate": 3.95e-06, "loss": 0.0015, "reward": 4.67194938659668, "reward_std": 1.9323244094848633, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5219494700431824, "rewards/reward_search_strategy": 0.3999999761581421, "step": 79 }, { "completion_length": 677.25, "epoch": 0.27972027972027974, "grad_norm": 0.42382290959358215, "kl": 0.0389033704996109, "learning_rate": 4.000000000000001e-06, "loss": 0.0016, "reward": 1.6928536891937256, "reward_std": 2.6118526458740234, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.367853581905365, "rewards/reward_search_strategy": 0.20000000298023224, "step": 80 }, { "completion_length": 472.125, "epoch": 0.28321678321678323, "grad_norm": 0.8816371560096741, "kl": 0.07524556666612625, "learning_rate": 4.05e-06, "loss": 0.003, "reward": 3.7551965713500977, "reward_std": 2.652488946914673, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.4801965653896332, "rewards/reward_search_strategy": 0.15000000596046448, "step": 81 }, { "completion_length": 1057.125, "epoch": 0.2867132867132867, "grad_norm": 0.5135111808776855, "kl": 0.03152136504650116, "learning_rate": 4.1e-06, "loss": 0.0013, "reward": 3.2323527336120605, "reward_std": 1.7208728790283203, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.682352602481842, "rewards/reward_search_strategy": 0.42500001192092896, "step": 82 }, { "completion_length": 485.375, "epoch": 0.2902097902097902, "grad_norm": 0.9151804447174072, "kl": 0.059312522411346436, "learning_rate": 4.15e-06, "loss": 0.0024, "reward": 1.5200467109680176, "reward_std": 1.384164810180664, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.4450467824935913, "rewards/reward_search_strategy": 0.07500000298023224, "step": 83 }, { "completion_length": 703.25, "epoch": 0.2937062937062937, "grad_norm": 0.708080530166626, "kl": 0.02395041473209858, "learning_rate": 4.2000000000000004e-06, "loss": 0.001, "reward": 4.584571838378906, "reward_std": 2.8503193855285645, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4345718026161194, "rewards/reward_search_strategy": 0.4000000059604645, "step": 84 }, { "completion_length": 507.75, "epoch": 0.2972027972027972, "grad_norm": 0.9842731356620789, "kl": 0.045889340341091156, "learning_rate": 4.25e-06, "loss": 0.0018, "reward": 6.216353893280029, "reward_std": 3.617387056350708, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4663536846637726, "rewards/reward_search_strategy": 0.75, "step": 85 }, { "completion_length": 362.75, "epoch": 0.3006993006993007, "grad_norm": 5.553736686706543, "kl": 0.31082066893577576, "learning_rate": 4.3e-06, "loss": 0.0124, "reward": 2.6352591514587402, "reward_std": 3.6418111324310303, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.18525923788547516, "rewards/reward_search_strategy": 0.32499998807907104, "step": 86 }, { "completion_length": 760.625, "epoch": 0.3041958041958042, "grad_norm": 0.40838149189949036, "kl": 0.03904329240322113, "learning_rate": 4.350000000000001e-06, "loss": 0.0016, "reward": 2.3138082027435303, "reward_std": 2.5393922328948975, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.43880823254585266, "rewards/reward_search_strategy": 0.375, "step": 87 }, { "completion_length": 233.875, "epoch": 0.3076923076923077, "grad_norm": 1.6991461515426636, "kl": 0.11344132572412491, "learning_rate": 4.4e-06, "loss": 0.0045, "reward": 5.137988090515137, "reward_std": 3.6587538719177246, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.28798776865005493, "rewards/reward_search_strategy": 0.4750000238418579, "step": 88 }, { "completion_length": 634.0, "epoch": 0.3111888111888112, "grad_norm": 1.008798599243164, "kl": 0.13237585127353668, "learning_rate": 4.450000000000001e-06, "loss": 0.0053, "reward": 2.58494234085083, "reward_std": 3.3475492000579834, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.30994224548339844, "rewards/reward_search_strategy": 0.3999999761581421, "step": 89 }, { "completion_length": 673.875, "epoch": 0.3146853146853147, "grad_norm": 0.38143813610076904, "kl": 0.04431832954287529, "learning_rate": 4.5e-06, "loss": 0.0018, "reward": 3.379831552505493, "reward_std": 2.6894783973693848, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.5048315525054932, "rewards/reward_search_strategy": 0.375, "step": 90 }, { "completion_length": 335.5, "epoch": 0.3181818181818182, "grad_norm": 1.326707124710083, "kl": 0.1567000150680542, "learning_rate": 4.5500000000000005e-06, "loss": 0.0063, "reward": 1.540712594985962, "reward_std": 1.9064555168151855, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.24071267247200012, "rewards/reward_search_strategy": 0.30000001192092896, "step": 91 }, { "completion_length": 479.25, "epoch": 0.32167832167832167, "grad_norm": 0.8233426213264465, "kl": 0.046305958181619644, "learning_rate": 4.600000000000001e-06, "loss": 0.0019, "reward": 4.979320526123047, "reward_std": 2.5221211910247803, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.6293203234672546, "rewards/reward_search_strategy": 0.3500000238418579, "step": 92 }, { "completion_length": 529.625, "epoch": 0.32517482517482516, "grad_norm": 0.8791444301605225, "kl": 0.049900904297828674, "learning_rate": 4.65e-06, "loss": 0.002, "reward": 4.73832893371582, "reward_std": 2.957406520843506, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5633291006088257, "rewards/reward_search_strategy": 0.550000011920929, "step": 93 }, { "completion_length": 566.25, "epoch": 0.32867132867132864, "grad_norm": 0.93157559633255, "kl": 0.10382921993732452, "learning_rate": 4.7e-06, "loss": 0.0042, "reward": 3.921233654022217, "reward_std": 3.259979486465454, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.39623361825942993, "rewards/reward_search_strategy": 0.5249999761581421, "step": 94 }, { "completion_length": 405.0, "epoch": 0.3321678321678322, "grad_norm": 0.6553046703338623, "kl": 0.05014060437679291, "learning_rate": 4.75e-06, "loss": 0.002, "reward": 4.048189163208008, "reward_std": 3.046299934387207, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.3731893002986908, "rewards/reward_search_strategy": 0.550000011920929, "step": 95 }, { "completion_length": 559.0, "epoch": 0.3356643356643357, "grad_norm": 0.9019679427146912, "kl": 0.1154535710811615, "learning_rate": 4.800000000000001e-06, "loss": 0.0046, "reward": 3.3237497806549072, "reward_std": 2.755406379699707, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.19874979555606842, "rewards/reward_search_strategy": 0.375, "step": 96 }, { "completion_length": 513.125, "epoch": 0.33916083916083917, "grad_norm": 1.1792876720428467, "kl": 0.20403671264648438, "learning_rate": 4.85e-06, "loss": 0.0082, "reward": 4.584352970123291, "reward_std": 3.532867908477783, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4093528091907501, "rewards/reward_search_strategy": 0.550000011920929, "step": 97 }, { "completion_length": 390.75, "epoch": 0.34265734265734266, "grad_norm": 24.102558135986328, "kl": 1.4576557874679565, "learning_rate": 4.9000000000000005e-06, "loss": 0.0583, "reward": 3.2650551795959473, "reward_std": 1.5778415203094482, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4150553047657013, "rewards/reward_search_strategy": 0.4749999940395355, "step": 98 }, { "completion_length": 992.0, "epoch": 0.34615384615384615, "grad_norm": 1.07889723777771, "kl": 0.15134873986244202, "learning_rate": 4.95e-06, "loss": 0.0061, "reward": 3.4669156074523926, "reward_std": 3.278291940689087, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.31691551208496094, "rewards/reward_search_strategy": 0.4000000059604645, "step": 99 }, { "completion_length": 839.625, "epoch": 0.34965034965034963, "grad_norm": 0.5948315262794495, "kl": 0.03611273318529129, "learning_rate": 5e-06, "loss": 0.0014, "reward": 2.273786783218384, "reward_std": 1.3839948177337646, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5487868189811707, "rewards/reward_search_strategy": 0.4750000238418579, "step": 100 }, { "completion_length": 658.5, "epoch": 0.3531468531468531, "grad_norm": 0.6043696999549866, "kl": 0.07113679498434067, "learning_rate": 4.999984769144476e-06, "loss": 0.0028, "reward": 4.308359622955322, "reward_std": 2.9922401905059814, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.5333598256111145, "rewards/reward_search_strategy": 0.5250000357627869, "step": 101 }, { "completion_length": 658.375, "epoch": 0.35664335664335667, "grad_norm": 0.6331404447555542, "kl": 0.06461314111948013, "learning_rate": 4.999939076763487e-06, "loss": 0.0026, "reward": 4.334439277648926, "reward_std": 3.60390567779541, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3594394326210022, "rewards/reward_search_strategy": 0.6000000238418579, "step": 102 }, { "completion_length": 549.875, "epoch": 0.36013986013986016, "grad_norm": 0.9000433087348938, "kl": 0.18242445588111877, "learning_rate": 4.999862923413781e-06, "loss": 0.0073, "reward": 3.720637559890747, "reward_std": 3.3442742824554443, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.3706376552581787, "rewards/reward_search_strategy": 0.4750000238418579, "step": 103 }, { "completion_length": 560.875, "epoch": 0.36363636363636365, "grad_norm": 0.9529889822006226, "kl": 0.13583947718143463, "learning_rate": 4.999756310023261e-06, "loss": 0.0054, "reward": 5.090951442718506, "reward_std": 3.163343906402588, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.41595128178596497, "rewards/reward_search_strategy": 0.42500001192092896, "step": 104 }, { "completion_length": 784.5, "epoch": 0.36713286713286714, "grad_norm": 0.5621578693389893, "kl": 0.06874603033065796, "learning_rate": 4.9996192378909785e-06, "loss": 0.0027, "reward": 1.6759297847747803, "reward_std": 1.3889375925064087, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.4009298086166382, "rewards/reward_search_strategy": 0.2750000059604645, "step": 105 }, { "completion_length": 811.875, "epoch": 0.3706293706293706, "grad_norm": 0.3865280747413635, "kl": 0.03063635341823101, "learning_rate": 4.999451708687114e-06, "loss": 0.0012, "reward": 1.8364933729171753, "reward_std": 1.0776033401489258, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.186493381857872, "rewards/reward_search_strategy": 0.6499999761581421, "step": 106 }, { "completion_length": 767.0, "epoch": 0.3741258741258741, "grad_norm": 1.0235356092453003, "kl": 0.05863216146826744, "learning_rate": 4.9992537244529585e-06, "loss": 0.0023, "reward": 3.36350154876709, "reward_std": 2.900977611541748, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.43850138783454895, "rewards/reward_search_strategy": 0.6749999523162842, "step": 107 }, { "completion_length": 471.125, "epoch": 0.3776223776223776, "grad_norm": 1.1427947282791138, "kl": 0.05708540976047516, "learning_rate": 4.999025287600886e-06, "loss": 0.0023, "reward": 3.4347455501556396, "reward_std": 3.3557960987091064, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.28474536538124084, "rewards/reward_search_strategy": 0.4000000059604645, "step": 108 }, { "completion_length": 680.5, "epoch": 0.3811188811188811, "grad_norm": 0.5091893076896667, "kl": 0.06495320051908493, "learning_rate": 4.998766400914329e-06, "loss": 0.0026, "reward": 5.424283981323242, "reward_std": 3.0205914974212646, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.2242841124534607, "rewards/reward_search_strategy": 0.824999988079071, "step": 109 }, { "completion_length": 546.875, "epoch": 0.38461538461538464, "grad_norm": 0.7380320429801941, "kl": 0.080781489610672, "learning_rate": 4.99847706754774e-06, "loss": 0.0032, "reward": 3.1829848289489746, "reward_std": 1.9441657066345215, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.45798495411872864, "rewards/reward_search_strategy": 0.22500000894069672, "step": 110 }, { "completion_length": 696.125, "epoch": 0.3881118881118881, "grad_norm": 0.7519396543502808, "kl": 0.0903296023607254, "learning_rate": 4.998157291026553e-06, "loss": 0.0036, "reward": 3.140177011489868, "reward_std": 2.6969096660614014, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4151769280433655, "rewards/reward_search_strategy": 0.3500000238418579, "step": 111 }, { "completion_length": 530.5, "epoch": 0.3916083916083916, "grad_norm": 1.3212255239486694, "kl": 0.17164835333824158, "learning_rate": 4.997807075247147e-06, "loss": 0.0069, "reward": 2.3374075889587402, "reward_std": 1.7672338485717773, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.41240769624710083, "rewards/reward_search_strategy": 0.30000001192092896, "step": 112 }, { "completion_length": 703.375, "epoch": 0.3951048951048951, "grad_norm": 0.48862800002098083, "kl": 0.04361271113157272, "learning_rate": 4.997426424476787e-06, "loss": 0.0017, "reward": 4.811786651611328, "reward_std": 3.4794275760650635, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.48678621649742126, "rewards/reward_search_strategy": 0.699999988079071, "step": 113 }, { "completion_length": 666.375, "epoch": 0.3986013986013986, "grad_norm": 0.7718109488487244, "kl": 0.040162038058042526, "learning_rate": 4.9970153433535855e-06, "loss": 0.0016, "reward": 3.659369468688965, "reward_std": 2.0675926208496094, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.4593694806098938, "rewards/reward_search_strategy": 0.5750000476837158, "step": 114 }, { "completion_length": 640.625, "epoch": 0.4020979020979021, "grad_norm": 0.5138198137283325, "kl": 0.05123981833457947, "learning_rate": 4.9965738368864345e-06, "loss": 0.002, "reward": 2.629551410675049, "reward_std": 1.7166107892990112, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5045512914657593, "rewards/reward_search_strategy": 0.5, "step": 115 }, { "completion_length": 586.75, "epoch": 0.40559440559440557, "grad_norm": 1.4575413465499878, "kl": 0.08412657678127289, "learning_rate": 4.996101910454953e-06, "loss": 0.0034, "reward": 4.0647077560424805, "reward_std": 1.9030133485794067, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4147075116634369, "rewards/reward_search_strategy": 0.40000003576278687, "step": 116 }, { "completion_length": 514.625, "epoch": 0.4090909090909091, "grad_norm": 1.051775574684143, "kl": 0.09374314546585083, "learning_rate": 4.995599569809414e-06, "loss": 0.0037, "reward": 2.5817337036132812, "reward_std": 2.1470091342926025, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.30673372745513916, "rewards/reward_search_strategy": 0.2750000059604645, "step": 117 }, { "completion_length": 814.375, "epoch": 0.4125874125874126, "grad_norm": 0.4415126144886017, "kl": 0.07567695528268814, "learning_rate": 4.9950668210706795e-06, "loss": 0.003, "reward": 2.189971685409546, "reward_std": 1.4690890312194824, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.5649716854095459, "rewards/reward_search_strategy": 0.375, "step": 118 }, { "completion_length": 550.375, "epoch": 0.4160839160839161, "grad_norm": 0.50792396068573, "kl": 0.05492626503109932, "learning_rate": 4.994503670730126e-06, "loss": 0.0022, "reward": 4.787566661834717, "reward_std": 2.5005829334259033, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6875668168067932, "rewards/reward_search_strategy": 0.6000000238418579, "step": 119 }, { "completion_length": 609.0, "epoch": 0.4195804195804196, "grad_norm": 0.6223260760307312, "kl": 0.047494076192379, "learning_rate": 4.993910125649561e-06, "loss": 0.0019, "reward": 4.249635696411133, "reward_std": 2.9354586601257324, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3746356964111328, "rewards/reward_search_strategy": 0.3750000298023224, "step": 120 }, { "completion_length": 560.875, "epoch": 0.4230769230769231, "grad_norm": 0.9563676118850708, "kl": 0.09799660742282867, "learning_rate": 4.993286193061145e-06, "loss": 0.0039, "reward": 2.565558910369873, "reward_std": 2.178126811981201, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3405587375164032, "rewards/reward_search_strategy": 0.3500000238418579, "step": 121 }, { "completion_length": 387.625, "epoch": 0.42657342657342656, "grad_norm": 0.7505446076393127, "kl": 0.14984500408172607, "learning_rate": 4.992631880567301e-06, "loss": 0.006, "reward": 1.1751203536987305, "reward_std": 1.6967118978500366, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.20012035965919495, "rewards/reward_search_strategy": 0.22500000894069672, "step": 122 }, { "completion_length": 411.875, "epoch": 0.43006993006993005, "grad_norm": 1.1681523323059082, "kl": 0.0963008776307106, "learning_rate": 4.991947196140619e-06, "loss": 0.0039, "reward": 3.0484721660614014, "reward_std": 3.416553497314453, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.3984720706939697, "rewards/reward_search_strategy": 0.4000000059604645, "step": 123 }, { "completion_length": 423.75, "epoch": 0.43356643356643354, "grad_norm": 0.9286041259765625, "kl": 0.07872038334608078, "learning_rate": 4.9912321481237616e-06, "loss": 0.0031, "reward": 3.2697560787200928, "reward_std": 3.4370007514953613, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3947560787200928, "rewards/reward_search_strategy": 0.5, "step": 124 }, { "completion_length": 463.625, "epoch": 0.4370629370629371, "grad_norm": 1.0566741228103638, "kl": 0.0985063686966896, "learning_rate": 4.990486745229364e-06, "loss": 0.0039, "reward": 1.5522143840789795, "reward_std": 1.579607605934143, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.37721437215805054, "rewards/reward_search_strategy": 0.17500001192092896, "step": 125 }, { "completion_length": 685.625, "epoch": 0.4405594405594406, "grad_norm": 0.6656001806259155, "kl": 0.06633864343166351, "learning_rate": 4.989710996539926e-06, "loss": 0.0027, "reward": 2.43574857711792, "reward_std": 2.3410024642944336, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.585748553276062, "rewards/reward_search_strategy": 0.3500000238418579, "step": 126 }, { "completion_length": 637.375, "epoch": 0.44405594405594406, "grad_norm": 0.7161982655525208, "kl": 0.07665619254112244, "learning_rate": 4.9889049115077e-06, "loss": 0.0031, "reward": 3.5046868324279785, "reward_std": 2.549543619155884, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5796867609024048, "rewards/reward_search_strategy": 0.550000011920929, "step": 127 }, { "completion_length": 644.375, "epoch": 0.44755244755244755, "grad_norm": 0.5414532423019409, "kl": 0.06415767967700958, "learning_rate": 4.988068499954578e-06, "loss": 0.0026, "reward": 5.926258563995361, "reward_std": 2.7303335666656494, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6762584447860718, "rewards/reward_search_strategy": 0.625, "step": 128 }, { "completion_length": 397.25, "epoch": 0.45104895104895104, "grad_norm": 0.8846190571784973, "kl": 0.14254269003868103, "learning_rate": 4.987201772071971e-06, "loss": 0.0057, "reward": 2.7677345275878906, "reward_std": 2.424544095993042, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5427343845367432, "rewards/reward_search_strategy": 0.4750000238418579, "step": 129 }, { "completion_length": 463.875, "epoch": 0.45454545454545453, "grad_norm": 0.6526610851287842, "kl": 0.12603093683719635, "learning_rate": 4.986304738420684e-06, "loss": 0.005, "reward": 5.665769577026367, "reward_std": 2.0002214908599854, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.1657698005437851, "rewards/reward_search_strategy": 0.5, "step": 130 }, { "completion_length": 425.125, "epoch": 0.458041958041958, "grad_norm": 1.6641560792922974, "kl": 0.2266431599855423, "learning_rate": 4.985377409930789e-06, "loss": 0.0091, "reward": 4.595471382141113, "reward_std": 3.3347935676574707, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.42047110199928284, "rewards/reward_search_strategy": 0.550000011920929, "step": 131 }, { "completion_length": 708.75, "epoch": 0.46153846153846156, "grad_norm": 0.49309638142585754, "kl": 0.07724378257989883, "learning_rate": 4.984419797901491e-06, "loss": 0.0031, "reward": 2.9709227085113525, "reward_std": 0.63145512342453, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.4459228515625, "rewards/reward_search_strategy": 0.5249999761581421, "step": 132 }, { "completion_length": 570.75, "epoch": 0.46503496503496505, "grad_norm": 0.8325252532958984, "kl": 0.15752217173576355, "learning_rate": 4.983431914000991e-06, "loss": 0.0063, "reward": 2.9882287979125977, "reward_std": 2.527655601501465, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.463228702545166, "rewards/reward_search_strategy": 0.5250000357627869, "step": 133 }, { "completion_length": 669.625, "epoch": 0.46853146853146854, "grad_norm": 1.057653784751892, "kl": 0.10220281034708023, "learning_rate": 4.9824137702663424e-06, "loss": 0.0041, "reward": 2.843283176422119, "reward_std": 2.226605176925659, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.3932831287384033, "rewards/reward_search_strategy": 0.45000001788139343, "step": 134 }, { "completion_length": 387.25, "epoch": 0.47202797202797203, "grad_norm": 0.7793525457382202, "kl": 0.09381990879774094, "learning_rate": 4.981365379103306e-06, "loss": 0.0038, "reward": 3.3774726390838623, "reward_std": 2.2550406455993652, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.37747257947921753, "rewards/reward_search_strategy": 0.5, "step": 135 }, { "completion_length": 454.625, "epoch": 0.4755244755244755, "grad_norm": 0.6707426309585571, "kl": 0.09792742133140564, "learning_rate": 4.980286753286196e-06, "loss": 0.0039, "reward": 5.85577392578125, "reward_std": 2.534198522567749, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5807737112045288, "rewards/reward_search_strategy": 0.6499999761581421, "step": 136 }, { "completion_length": 513.0, "epoch": 0.479020979020979, "grad_norm": 0.91905677318573, "kl": 0.12576593458652496, "learning_rate": 4.979177905957726e-06, "loss": 0.005, "reward": 2.693246603012085, "reward_std": 1.9298425912857056, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.44324639439582825, "rewards/reward_search_strategy": 0.25, "step": 137 }, { "completion_length": 535.0, "epoch": 0.4825174825174825, "grad_norm": 0.6421681642532349, "kl": 0.16189059615135193, "learning_rate": 4.978038850628855e-06, "loss": 0.0065, "reward": 2.3643674850463867, "reward_std": 2.4537479877471924, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.614367663860321, "rewards/reward_search_strategy": 0.5, "step": 138 }, { "completion_length": 388.625, "epoch": 0.486013986013986, "grad_norm": 1.6358879804611206, "kl": 0.31070151925086975, "learning_rate": 4.9768696011786095e-06, "loss": 0.0124, "reward": 7.369345664978027, "reward_std": 2.027414560317993, "rewards/reward_correctness": 0.875, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.11934606730937958, "rewards/reward_search_strategy": 0.875, "step": 139 }, { "completion_length": 430.375, "epoch": 0.48951048951048953, "grad_norm": 1.3875221014022827, "kl": 0.1299194097518921, "learning_rate": 4.975670171853926e-06, "loss": 0.0052, "reward": 4.600214958190918, "reward_std": 3.4139461517333984, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4752153754234314, "rewards/reward_search_strategy": 0.625, "step": 140 }, { "completion_length": 619.375, "epoch": 0.493006993006993, "grad_norm": 1.2012630701065063, "kl": 0.07434721291065216, "learning_rate": 4.974440577269473e-06, "loss": 0.003, "reward": 3.8403759002685547, "reward_std": 2.251944065093994, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.34037598967552185, "rewards/reward_search_strategy": 0.625, "step": 141 }, { "completion_length": 662.0, "epoch": 0.4965034965034965, "grad_norm": 0.4479227066040039, "kl": 0.08045605570077896, "learning_rate": 4.973180832407471e-06, "loss": 0.0032, "reward": 4.356062889099121, "reward_std": 1.873792052268982, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.6060628294944763, "rewards/reward_search_strategy": 0.5, "step": 142 }, { "completion_length": 408.0, "epoch": 0.5, "grad_norm": 0.774638831615448, "kl": 0.16293062269687653, "learning_rate": 4.971890952617515e-06, "loss": 0.0065, "reward": 3.544522762298584, "reward_std": 3.194159507751465, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.31952300667762756, "rewards/reward_search_strategy": 0.3500000238418579, "step": 143 }, { "completion_length": 466.75, "epoch": 0.5034965034965035, "grad_norm": 0.8625693917274475, "kl": 0.10907953977584839, "learning_rate": 4.970570953616383e-06, "loss": 0.0044, "reward": 3.3587875366210938, "reward_std": 2.7467174530029297, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.3837875723838806, "rewards/reward_search_strategy": 0.3500000238418579, "step": 144 }, { "completion_length": 856.125, "epoch": 0.506993006993007, "grad_norm": 0.37043091654777527, "kl": 0.05690326541662216, "learning_rate": 4.9692208514878445e-06, "loss": 0.0023, "reward": 5.424896240234375, "reward_std": 2.465031862258911, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.49989593029022217, "rewards/reward_search_strategy": 0.7999999523162842, "step": 145 }, { "completion_length": 557.25, "epoch": 0.5104895104895105, "grad_norm": 0.8445433378219604, "kl": 0.06749890744686127, "learning_rate": 4.96784066268247e-06, "loss": 0.0027, "reward": 2.7921762466430664, "reward_std": 1.2953386306762695, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5421763062477112, "rewards/reward_search_strategy": 0.625, "step": 146 }, { "completion_length": 447.5, "epoch": 0.513986013986014, "grad_norm": 0.9278783798217773, "kl": 0.10697238147258759, "learning_rate": 4.966430404017424e-06, "loss": 0.0043, "reward": 3.9682509899139404, "reward_std": 3.156663417816162, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.41825127601623535, "rewards/reward_search_strategy": 0.42500001192092896, "step": 147 }, { "completion_length": 379.625, "epoch": 0.5174825174825175, "grad_norm": 1.5938533544540405, "kl": 0.11513354629278183, "learning_rate": 4.964990092676263e-06, "loss": 0.0046, "reward": 3.4113523960113525, "reward_std": 2.4409544467926025, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.2863525450229645, "rewards/reward_search_strategy": 0.5, "step": 148 }, { "completion_length": 411.875, "epoch": 0.5209790209790209, "grad_norm": 1.070616364479065, "kl": 0.2734951078891754, "learning_rate": 4.963519746208726e-06, "loss": 0.0109, "reward": 2.4380977153778076, "reward_std": 2.1429409980773926, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.363097608089447, "rewards/reward_search_strategy": 0.20000000298023224, "step": 149 }, { "completion_length": 355.75, "epoch": 0.5244755244755245, "grad_norm": 0.9825189113616943, "kl": 0.14074762165546417, "learning_rate": 4.962019382530521e-06, "loss": 0.0056, "reward": 6.416107177734375, "reward_std": 3.2488479614257812, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6411075592041016, "rewards/reward_search_strategy": 0.7749999761581421, "step": 150 }, { "completion_length": 642.5, "epoch": 0.527972027972028, "grad_norm": 0.9618121385574341, "kl": 0.07677344232797623, "learning_rate": 4.960489019923105e-06, "loss": 0.0031, "reward": 2.5563745498657227, "reward_std": 1.2503511905670166, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4313744604587555, "rewards/reward_search_strategy": 0.625, "step": 151 }, { "completion_length": 513.375, "epoch": 0.5314685314685315, "grad_norm": 1.855919599533081, "kl": 0.21450263261795044, "learning_rate": 4.958928677033465e-06, "loss": 0.0086, "reward": 3.358847141265869, "reward_std": 1.8732644319534302, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.28384697437286377, "rewards/reward_search_strategy": 0.5750000476837158, "step": 152 }, { "completion_length": 478.375, "epoch": 0.534965034965035, "grad_norm": 0.5649963021278381, "kl": 0.059622377157211304, "learning_rate": 4.957338372873886e-06, "loss": 0.0024, "reward": 3.669475793838501, "reward_std": 3.19222354888916, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4944758117198944, "rewards/reward_search_strategy": 0.550000011920929, "step": 153 }, { "completion_length": 233.25, "epoch": 0.5384615384615384, "grad_norm": 1.1332588195800781, "kl": 0.09504832327365875, "learning_rate": 4.9557181268217225e-06, "loss": 0.0038, "reward": 3.0735678672790527, "reward_std": 2.9650135040283203, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.22356799244880676, "rewards/reward_search_strategy": 0.10000000149011612, "step": 154 }, { "completion_length": 365.75, "epoch": 0.541958041958042, "grad_norm": 0.7829000949859619, "kl": 0.12874586880207062, "learning_rate": 4.9540679586191605e-06, "loss": 0.0051, "reward": 3.2776992321014404, "reward_std": 2.2649874687194824, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4526992440223694, "rewards/reward_search_strategy": 0.44999998807907104, "step": 155 }, { "completion_length": 287.875, "epoch": 0.5454545454545454, "grad_norm": 1.0810236930847168, "kl": 0.17965207993984222, "learning_rate": 4.9523878883729794e-06, "loss": 0.0072, "reward": 5.42661714553833, "reward_std": 3.3766465187072754, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.40161705017089844, "rewards/reward_search_strategy": 0.6500000357627869, "step": 156 }, { "completion_length": 562.375, "epoch": 0.548951048951049, "grad_norm": 0.629564642906189, "kl": 0.047585126012563705, "learning_rate": 4.9506779365543054e-06, "loss": 0.0019, "reward": 3.9784889221191406, "reward_std": 0.8592731356620789, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.628489077091217, "rewards/reward_search_strategy": 0.4749999940395355, "step": 157 }, { "completion_length": 333.125, "epoch": 0.5524475524475524, "grad_norm": 0.8598470091819763, "kl": 0.23344238102436066, "learning_rate": 4.94893812399836e-06, "loss": 0.0093, "reward": 5.325415134429932, "reward_std": 2.6435444355010986, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3754151463508606, "rewards/reward_search_strategy": 0.44999998807907104, "step": 158 }, { "completion_length": 369.0, "epoch": 0.5559440559440559, "grad_norm": 1.518742322921753, "kl": 0.1705542653799057, "learning_rate": 4.947168471904213e-06, "loss": 0.0068, "reward": 5.637444972991943, "reward_std": 3.7453742027282715, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.41244518756866455, "rewards/reward_search_strategy": 0.6000000238418579, "step": 159 }, { "completion_length": 409.625, "epoch": 0.5594405594405595, "grad_norm": 1.2653708457946777, "kl": 0.2512824833393097, "learning_rate": 4.9453690018345144e-06, "loss": 0.0101, "reward": 1.8753092288970947, "reward_std": 1.6477832794189453, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.32530921697616577, "rewards/reward_search_strategy": 0.30000001192092896, "step": 160 }, { "completion_length": 766.375, "epoch": 0.5629370629370629, "grad_norm": 0.4625481367111206, "kl": 0.056169163435697556, "learning_rate": 4.9435397357152406e-06, "loss": 0.0022, "reward": 3.3957467079162598, "reward_std": 1.1029468774795532, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5957465767860413, "rewards/reward_search_strategy": 0.550000011920929, "step": 161 }, { "completion_length": 749.875, "epoch": 0.5664335664335665, "grad_norm": 0.6248977184295654, "kl": 0.07820535451173782, "learning_rate": 4.9416806958354206e-06, "loss": 0.0031, "reward": 4.84914493560791, "reward_std": 3.21990704536438, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.5491449236869812, "rewards/reward_search_strategy": 0.7999999523162842, "step": 162 }, { "completion_length": 318.375, "epoch": 0.5699300699300699, "grad_norm": 2.4732108116149902, "kl": 0.2569376230239868, "learning_rate": 4.939791904846869e-06, "loss": 0.0103, "reward": 5.542656898498535, "reward_std": 3.050025224685669, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.39265698194503784, "rewards/reward_search_strategy": 0.4000000059604645, "step": 163 }, { "completion_length": 638.5, "epoch": 0.5734265734265734, "grad_norm": 0.5316995978355408, "kl": 0.06073322519659996, "learning_rate": 4.937873385763909e-06, "loss": 0.0024, "reward": 3.4646921157836914, "reward_std": 0.9867516160011292, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.614692211151123, "rewards/reward_search_strategy": 0.4750000238418579, "step": 164 }, { "completion_length": 543.0, "epoch": 0.5769230769230769, "grad_norm": 8.116938591003418, "kl": 4.153862953186035, "learning_rate": 4.935925161963089e-06, "loss": 0.1662, "reward": 4.612438678741455, "reward_std": 3.483085870742798, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.5374384522438049, "rewards/reward_search_strategy": 0.699999988079071, "step": 165 }, { "completion_length": 378.5, "epoch": 0.5804195804195804, "grad_norm": 0.9611401557922363, "kl": 0.18193311989307404, "learning_rate": 4.933947257182901e-06, "loss": 0.0073, "reward": 4.473653316497803, "reward_std": 3.3241894245147705, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.19865311682224274, "rewards/reward_search_strategy": 0.7749999761581421, "step": 166 }, { "completion_length": 465.625, "epoch": 0.583916083916084, "grad_norm": 0.7452073097229004, "kl": 0.09764686226844788, "learning_rate": 4.9319396955234925e-06, "loss": 0.0039, "reward": 4.452567100524902, "reward_std": 3.8106887340545654, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4025672674179077, "rewards/reward_search_strategy": 0.550000011920929, "step": 167 }, { "completion_length": 613.75, "epoch": 0.5874125874125874, "grad_norm": 0.5454308390617371, "kl": 0.057498060166835785, "learning_rate": 4.9299025014463665e-06, "loss": 0.0023, "reward": 2.4831161499023438, "reward_std": 0.9839221239089966, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.43311628699302673, "rewards/reward_search_strategy": 0.550000011920929, "step": 168 }, { "completion_length": 654.25, "epoch": 0.5909090909090909, "grad_norm": 0.566487193107605, "kl": 0.04927007481455803, "learning_rate": 4.92783569977409e-06, "loss": 0.002, "reward": 3.5991201400756836, "reward_std": 2.8235316276550293, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.22412018477916718, "rewards/reward_search_strategy": 0.625, "step": 169 }, { "completion_length": 295.625, "epoch": 0.5944055944055944, "grad_norm": 1.1766833066940308, "kl": 0.24150727689266205, "learning_rate": 4.925739315689991e-06, "loss": 0.0097, "reward": 2.3266749382019043, "reward_std": 3.2796859741210938, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.27667519450187683, "rewards/reward_search_strategy": 0.30000001192092896, "step": 170 }, { "completion_length": 527.125, "epoch": 0.5979020979020979, "grad_norm": 0.6295068264007568, "kl": 0.08597421646118164, "learning_rate": 4.923613374737848e-06, "loss": 0.0034, "reward": 3.203831672668457, "reward_std": 2.0840423107147217, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.6288318037986755, "rewards/reward_search_strategy": 0.574999988079071, "step": 171 }, { "completion_length": 423.5, "epoch": 0.6013986013986014, "grad_norm": 0.7889199256896973, "kl": 0.09207924455404282, "learning_rate": 4.921457902821578e-06, "loss": 0.0037, "reward": 3.5908405780792236, "reward_std": 2.698184013366699, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3158404231071472, "rewards/reward_search_strategy": 0.40000003576278687, "step": 172 }, { "completion_length": 515.875, "epoch": 0.6048951048951049, "grad_norm": 1.1472229957580566, "kl": 0.08138255774974823, "learning_rate": 4.9192729262049285e-06, "loss": 0.0033, "reward": 2.5571367740631104, "reward_std": 2.301767349243164, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.30713677406311035, "rewards/reward_search_strategy": 0.25, "step": 173 }, { "completion_length": 380.5, "epoch": 0.6083916083916084, "grad_norm": 1.0713775157928467, "kl": 0.08360132575035095, "learning_rate": 4.917058471511149e-06, "loss": 0.0033, "reward": 2.460089683532715, "reward_std": 2.595458507537842, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.4350895881652832, "rewards/reward_search_strategy": 0.2750000059604645, "step": 174 }, { "completion_length": 529.25, "epoch": 0.6118881118881119, "grad_norm": 0.463553249835968, "kl": 0.05024154111742973, "learning_rate": 4.914814565722671e-06, "loss": 0.002, "reward": 3.3107078075408936, "reward_std": 2.1608943939208984, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.48570770025253296, "rewards/reward_search_strategy": 0.5750000476837158, "step": 175 }, { "completion_length": 482.75, "epoch": 0.6153846153846154, "grad_norm": 1.2290676832199097, "kl": 0.14474555850028992, "learning_rate": 4.912541236180779e-06, "loss": 0.0058, "reward": 3.2196455001831055, "reward_std": 2.8522043228149414, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.2946455180644989, "rewards/reward_search_strategy": 0.42500001192092896, "step": 176 }, { "completion_length": 423.75, "epoch": 0.6188811188811189, "grad_norm": 1.102494716644287, "kl": 0.059243083000183105, "learning_rate": 4.910238510585275e-06, "loss": 0.0024, "reward": 2.8200690746307373, "reward_std": 2.557361364364624, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.39506906270980835, "rewards/reward_search_strategy": 0.42500001192092896, "step": 177 }, { "completion_length": 502.125, "epoch": 0.6223776223776224, "grad_norm": 1.207655668258667, "kl": 0.11078634113073349, "learning_rate": 4.907906416994146e-06, "loss": 0.0044, "reward": 4.589856147766113, "reward_std": 2.6245768070220947, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.48985564708709717, "rewards/reward_search_strategy": 0.6000000238418579, "step": 178 }, { "completion_length": 508.875, "epoch": 0.6258741258741258, "grad_norm": 1.0890285968780518, "kl": 0.08423227816820145, "learning_rate": 4.905544983823214e-06, "loss": 0.0034, "reward": 2.421203374862671, "reward_std": 1.544132113456726, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.44620347023010254, "rewards/reward_search_strategy": 0.3500000238418579, "step": 179 }, { "completion_length": 332.875, "epoch": 0.6293706293706294, "grad_norm": 1.8258018493652344, "kl": 0.15480613708496094, "learning_rate": 4.903154239845798e-06, "loss": 0.0062, "reward": 1.375791311264038, "reward_std": 1.4795390367507935, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.3257913589477539, "rewards/reward_search_strategy": 0.30000001192092896, "step": 180 }, { "completion_length": 312.25, "epoch": 0.6328671328671329, "grad_norm": 6.885568618774414, "kl": 0.5282591581344604, "learning_rate": 4.900734214192358e-06, "loss": 0.0211, "reward": 4.6441650390625, "reward_std": 3.3753483295440674, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5191651582717896, "rewards/reward_search_strategy": 0.5, "step": 181 }, { "completion_length": 442.375, "epoch": 0.6363636363636364, "grad_norm": 0.612838864326477, "kl": 0.08585107326507568, "learning_rate": 4.898284936350144e-06, "loss": 0.0034, "reward": 4.933065891265869, "reward_std": 3.642153739929199, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.7080655694007874, "rewards/reward_search_strategy": 0.4749999940395355, "step": 182 }, { "completion_length": 244.25, "epoch": 0.6398601398601399, "grad_norm": 1.8637006282806396, "kl": 0.38500577211380005, "learning_rate": 4.8958064361628334e-06, "loss": 0.0154, "reward": 6.256934642791748, "reward_std": 1.2352032661437988, "rewards/reward_correctness": 1.0, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.4569346308708191, "rewards/reward_search_strategy": 0.30000001192092896, "step": 183 }, { "completion_length": 373.5, "epoch": 0.6433566433566433, "grad_norm": 1.1437357664108276, "kl": 0.12481357157230377, "learning_rate": 4.893298743830168e-06, "loss": 0.005, "reward": 3.356623411178589, "reward_std": 2.37040638923645, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5566233992576599, "rewards/reward_search_strategy": 0.30000001192092896, "step": 184 }, { "completion_length": 549.125, "epoch": 0.6468531468531469, "grad_norm": 0.6662846207618713, "kl": 0.10428506135940552, "learning_rate": 4.890761889907589e-06, "loss": 0.0042, "reward": 6.009631633758545, "reward_std": 3.1016297340393066, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.7346314191818237, "rewards/reward_search_strategy": 0.6499999761581421, "step": 185 }, { "completion_length": 549.0, "epoch": 0.6503496503496503, "grad_norm": 0.7750718593597412, "kl": 0.09990722686052322, "learning_rate": 4.888195905305859e-06, "loss": 0.004, "reward": 4.673529624938965, "reward_std": 2.434964656829834, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 1.0, "rewards/reward_search_diversity": 0.5485298037528992, "rewards/reward_search_strategy": 0.625, "step": 186 }, { "completion_length": 531.625, "epoch": 0.6538461538461539, "grad_norm": 0.8020265102386475, "kl": 0.09699496626853943, "learning_rate": 4.885600821290692e-06, "loss": 0.0039, "reward": 3.3051586151123047, "reward_std": 1.4629502296447754, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.4051586389541626, "rewards/reward_search_strategy": 0.4000000059604645, "step": 187 }, { "completion_length": 522.875, "epoch": 0.6573426573426573, "grad_norm": 1.3859021663665771, "kl": 0.13135036826133728, "learning_rate": 4.882976669482368e-06, "loss": 0.0053, "reward": 2.6884827613830566, "reward_std": 2.2438197135925293, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.388482928276062, "rewards/reward_search_strategy": 0.30000001192092896, "step": 188 }, { "completion_length": 518.5, "epoch": 0.6608391608391608, "grad_norm": 9.410249710083008, "kl": 0.6956667900085449, "learning_rate": 4.880323481855347e-06, "loss": 0.0278, "reward": 4.050230979919434, "reward_std": 3.249497652053833, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.37523072957992554, "rewards/reward_search_strategy": 0.550000011920929, "step": 189 }, { "completion_length": 386.5, "epoch": 0.6643356643356644, "grad_norm": 0.8387676477432251, "kl": 0.08535090833902359, "learning_rate": 4.8776412907378845e-06, "loss": 0.0034, "reward": 4.588320255279541, "reward_std": 2.220287799835205, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.875, "rewards/reward_search_diversity": 0.28832051157951355, "rewards/reward_search_strategy": 0.550000011920929, "step": 190 }, { "completion_length": 408.5, "epoch": 0.6678321678321678, "grad_norm": 2.006375551223755, "kl": 0.11828400194644928, "learning_rate": 4.874930128811631e-06, "loss": 0.0047, "reward": 3.0512070655822754, "reward_std": 2.421907424926758, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.30120688676834106, "rewards/reward_search_strategy": 0.5, "step": 191 }, { "completion_length": 245.75, "epoch": 0.6713286713286714, "grad_norm": 1.0472596883773804, "kl": 0.12134737521409988, "learning_rate": 4.8721900291112415e-06, "loss": 0.0049, "reward": 4.235280990600586, "reward_std": 3.679039716720581, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.3602810502052307, "rewards/reward_search_strategy": 0.5, "step": 192 }, { "completion_length": 318.625, "epoch": 0.6748251748251748, "grad_norm": 1.2686798572540283, "kl": 0.17116394639015198, "learning_rate": 4.869421025023965e-06, "loss": 0.0068, "reward": 4.4517951011657715, "reward_std": 3.264738082885742, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5267948508262634, "rewards/reward_search_strategy": 0.42500001192092896, "step": 193 }, { "completion_length": 284.0, "epoch": 0.6783216783216783, "grad_norm": 1.0447121858596802, "kl": 0.2941432595252991, "learning_rate": 4.866623150289241e-06, "loss": 0.0118, "reward": 1.9646129608154297, "reward_std": 2.391249418258667, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.33961305022239685, "rewards/reward_search_strategy": 0.25, "step": 194 }, { "completion_length": 497.0, "epoch": 0.6818181818181818, "grad_norm": 0.604982852935791, "kl": 0.06880504637956619, "learning_rate": 4.863796438998293e-06, "loss": 0.0028, "reward": 6.05696964263916, "reward_std": 2.8059637546539307, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 1.0, "rewards/reward_format": 0.625, "rewards/reward_search_diversity": 0.5319693684577942, "rewards/reward_search_strategy": 0.7749999761581421, "step": 195 }, { "completion_length": 322.625, "epoch": 0.6853146853146853, "grad_norm": 1.2860020399093628, "kl": 0.195574089884758, "learning_rate": 4.860940925593703e-06, "loss": 0.0078, "reward": 4.560000419616699, "reward_std": 2.813296318054199, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.31000030040740967, "rewards/reward_search_strategy": 0.375, "step": 196 }, { "completion_length": 536.75, "epoch": 0.6888111888111889, "grad_norm": 0.6069127321243286, "kl": 0.055065739899873734, "learning_rate": 4.858056644869002e-06, "loss": 0.0022, "reward": 4.489222049713135, "reward_std": 2.621090888977051, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5142220854759216, "rewards/reward_search_strategy": 0.6000000238418579, "step": 197 }, { "completion_length": 288.125, "epoch": 0.6923076923076923, "grad_norm": 1.4299031496047974, "kl": 0.28020644187927246, "learning_rate": 4.855143631968242e-06, "loss": 0.0112, "reward": 1.7020204067230225, "reward_std": 1.4690635204315186, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.5, "rewards/reward_search_diversity": 0.2770204544067383, "rewards/reward_search_strategy": 0.42500001192092896, "step": 198 }, { "completion_length": 451.625, "epoch": 0.6958041958041958, "grad_norm": 0.6012458205223083, "kl": 0.10064040124416351, "learning_rate": 4.852201922385564e-06, "loss": 0.004, "reward": 4.045350551605225, "reward_std": 2.6484973430633545, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.5953505635261536, "rewards/reward_search_strategy": 0.5750000476837158, "step": 199 }, { "completion_length": 381.75, "epoch": 0.6993006993006993, "grad_norm": 1.712384819984436, "kl": 0.23387499153614044, "learning_rate": 4.849231551964771e-06, "loss": 0.0094, "reward": 6.612201690673828, "reward_std": 3.49477481842041, "rewards/reward_correctness": 0.75, "rewards/reward_em_chunk": 0.875, "rewards/reward_format": 0.75, "rewards/reward_search_diversity": 0.4872019290924072, "rewards/reward_search_strategy": 0.75, "step": 200 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }