{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.17482517482517482, "eval_steps": 500, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 145.875, "epoch": 0.0034965034965034965, "grad_norm": 8.602514266967773, "kl": 0.0, "learning_rate": 5.0000000000000004e-08, "loss": 0.0, "reward": 0.7051772475242615, "reward_std": 0.8263505697250366, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.3301772475242615, "rewards/reward_search_strategy": 0.0, "step": 1 }, { "completion_length": 88.125, "epoch": 0.006993006993006993, "grad_norm": 1.4985997676849365, "kl": 0.0, "learning_rate": 1.0000000000000001e-07, "loss": -0.0, "reward": 1.8388628959655762, "reward_std": 2.5329272747039795, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.2388630509376526, "rewards/reward_search_strategy": 0.10000000149011612, "step": 2 }, { "completion_length": 296.5, "epoch": 0.01048951048951049, "grad_norm": 0.7674520611763, "kl": 0.0006381775019690394, "learning_rate": 1.5000000000000002e-07, "loss": 0.0, "reward": 1.654070258140564, "reward_std": 2.827007532119751, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.5, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.17907029390335083, "rewards/reward_search_strategy": 0.22499999403953552, "step": 3 }, { "completion_length": 290.125, "epoch": 0.013986013986013986, "grad_norm": 1.0457236766815186, "kl": 0.0007995082996785641, "learning_rate": 2.0000000000000002e-07, "loss": 0.0, "reward": 3.421433448791504, "reward_std": 2.9866631031036377, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.396433562040329, "rewards/reward_search_strategy": 0.2750000059604645, "step": 4 }, { "completion_length": 259.125, "epoch": 0.017482517482517484, "grad_norm": 1.3121755123138428, "kl": 0.000728036102373153, "learning_rate": 2.5000000000000004e-07, "loss": 0.0, "reward": 0.9268761873245239, "reward_std": 0.7560767531394958, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.2268761545419693, "rewards/reward_search_strategy": 0.07500000298023224, "step": 5 }, { "completion_length": 383.5, "epoch": 0.02097902097902098, "grad_norm": 0.9658453464508057, "kl": 0.0007470359560102224, "learning_rate": 3.0000000000000004e-07, "loss": 0.0, "reward": 1.4594597816467285, "reward_std": 2.2520837783813477, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.25945988297462463, "rewards/reward_search_strategy": 0.07500000298023224, "step": 6 }, { "completion_length": 144.375, "epoch": 0.024475524475524476, "grad_norm": 1.1642646789550781, "kl": 0.0009361266857013106, "learning_rate": 3.5000000000000004e-07, "loss": 0.0, "reward": 1.378964900970459, "reward_std": 2.0963218212127686, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.30396491289138794, "rewards/reward_search_strategy": 0.07500000298023224, "step": 7 }, { "completion_length": 314.375, "epoch": 0.027972027972027972, "grad_norm": 5.21930456161499, "kl": 0.0008346753311343491, "learning_rate": 4.0000000000000003e-07, "loss": 0.0, "reward": 1.181166410446167, "reward_std": 1.6016024351119995, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.3561664819717407, "rewards/reward_search_strategy": 0.07500000298023224, "step": 8 }, { "completion_length": 98.375, "epoch": 0.03146853146853147, "grad_norm": 1.6168773174285889, "kl": 0.0009058149298653007, "learning_rate": 4.5000000000000003e-07, "loss": 0.0, "reward": 1.141206979751587, "reward_std": 2.1089890003204346, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.16620691120624542, "rewards/reward_search_strategy": 0.10000000149011612, "step": 9 }, { "completion_length": 269.625, "epoch": 0.03496503496503497, "grad_norm": 1.310072660446167, "kl": 0.000750248203985393, "learning_rate": 5.000000000000001e-07, "loss": 0.0, "reward": 1.901651382446289, "reward_std": 3.08128023147583, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.27665144205093384, "rewards/reward_search_strategy": 0.125, "step": 10 }, { "completion_length": 311.125, "epoch": 0.038461538461538464, "grad_norm": 1.1051065921783447, "kl": 0.000834438658785075, "learning_rate": 5.5e-07, "loss": 0.0, "reward": 2.1270594596862793, "reward_std": 2.309929132461548, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.377059668302536, "rewards/reward_search_strategy": 0.125, "step": 11 }, { "completion_length": 182.375, "epoch": 0.04195804195804196, "grad_norm": 1.3039528131484985, "kl": 0.0007587745785713196, "learning_rate": 6.000000000000001e-07, "loss": 0.0, "reward": 0.24762853980064392, "reward_std": 0.28513866662979126, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.17262855172157288, "rewards/reward_search_strategy": 0.07500000298023224, "step": 12 }, { "completion_length": 198.5, "epoch": 0.045454545454545456, "grad_norm": 1.176461935043335, "kl": 0.0008510244661010802, "learning_rate": 6.5e-07, "loss": 0.0, "reward": 2.0060770511627197, "reward_std": 2.2957346439361572, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.3060770034790039, "rewards/reward_search_strategy": 0.07500000298023224, "step": 13 }, { "completion_length": 158.25, "epoch": 0.04895104895104895, "grad_norm": 1.1484898328781128, "kl": 0.0009028888889588416, "learning_rate": 7.000000000000001e-07, "loss": 0.0, "reward": 0.2598831355571747, "reward_std": 0.4156216084957123, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.1348831057548523, "rewards/reward_search_strategy": 0.0, "step": 14 }, { "completion_length": 321.125, "epoch": 0.05244755244755245, "grad_norm": 1.08210027217865, "kl": 0.0007499511120840907, "learning_rate": 7.5e-07, "loss": 0.0, "reward": 2.711674213409424, "reward_std": 3.2537524700164795, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.3866744041442871, "rewards/reward_search_strategy": 0.20000000298023224, "step": 15 }, { "completion_length": 176.0, "epoch": 0.055944055944055944, "grad_norm": 1.007460117340088, "kl": 0.0008281145128421485, "learning_rate": 8.000000000000001e-07, "loss": 0.0, "reward": 1.6642776727676392, "reward_std": 3.1836605072021484, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.1892777681350708, "rewards/reward_search_strategy": 0.10000000149011612, "step": 16 }, { "completion_length": 299.0, "epoch": 0.05944055944055944, "grad_norm": 0.9308914542198181, "kl": 0.0006533896084874868, "learning_rate": 8.500000000000001e-07, "loss": 0.0, "reward": 2.9600870609283447, "reward_std": 3.002318859100342, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.38508710265159607, "rewards/reward_search_strategy": 0.07500000298023224, "step": 17 }, { "completion_length": 170.125, "epoch": 0.06293706293706294, "grad_norm": 1.6546720266342163, "kl": 0.0009436010150238872, "learning_rate": 9.000000000000001e-07, "loss": 0.0, "reward": 1.807711124420166, "reward_std": 2.873779773712158, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.33271118998527527, "rewards/reward_search_strategy": 0.22499999403953552, "step": 18 }, { "completion_length": 377.125, "epoch": 0.06643356643356643, "grad_norm": 0.7977221608161926, "kl": 0.0008231342071667314, "learning_rate": 9.500000000000001e-07, "loss": 0.0, "reward": 0.7038719058036804, "reward_std": 1.2563936710357666, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.22887186706066132, "rewards/reward_search_strategy": 0.10000000149011612, "step": 19 }, { "completion_length": 161.625, "epoch": 0.06993006993006994, "grad_norm": 1.5974496603012085, "kl": 0.000790413178037852, "learning_rate": 1.0000000000000002e-06, "loss": 0.0, "reward": 2.404862880706787, "reward_std": 2.712550163269043, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.179862841963768, "rewards/reward_search_strategy": 0.10000000149011612, "step": 20 }, { "completion_length": 298.875, "epoch": 0.07342657342657342, "grad_norm": 0.8817616105079651, "kl": 0.0007674504304304719, "learning_rate": 1.0500000000000001e-06, "loss": 0.0, "reward": 2.6999998092651367, "reward_std": 3.122270345687866, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.375, "rewards/reward_search_diversity": 0.05000000074505806, "rewards/reward_search_strategy": 0.2750000059604645, "step": 21 }, { "completion_length": 150.125, "epoch": 0.07692307692307693, "grad_norm": 1.345902919769287, "kl": 0.0010101046646013856, "learning_rate": 1.1e-06, "loss": 0.0, "reward": 1.2625212669372559, "reward_std": 1.9802589416503906, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.13752126693725586, "rewards/reward_search_strategy": 0.0, "step": 22 }, { "completion_length": 156.625, "epoch": 0.08041958041958042, "grad_norm": 3.1841650009155273, "kl": 0.0009282200480811298, "learning_rate": 1.1500000000000002e-06, "loss": 0.0, "reward": 2.532832145690918, "reward_std": 3.623222827911377, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.18283231556415558, "rewards/reward_search_strategy": 0.22500000894069672, "step": 23 }, { "completion_length": 230.625, "epoch": 0.08391608391608392, "grad_norm": 0.560590922832489, "kl": 0.0008200581069104373, "learning_rate": 1.2000000000000002e-06, "loss": 0.0, "reward": 0.7196725606918335, "reward_std": 1.5131947994232178, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.1946725845336914, "rewards/reward_search_strategy": 0.02500000037252903, "step": 24 }, { "completion_length": 232.5, "epoch": 0.08741258741258741, "grad_norm": 5.627699375152588, "kl": 0.0007359281880781054, "learning_rate": 1.25e-06, "loss": 0.0, "reward": 2.3161444664001465, "reward_std": 2.266465425491333, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.31614452600479126, "rewards/reward_search_strategy": 0.0, "step": 25 }, { "completion_length": 256.625, "epoch": 0.09090909090909091, "grad_norm": 1.243622064590454, "kl": 0.0009776452789083123, "learning_rate": 1.3e-06, "loss": 0.0, "reward": 1.8542793989181519, "reward_std": 2.3880927562713623, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.22927939891815186, "rewards/reward_search_strategy": 0.0, "step": 26 }, { "completion_length": 221.625, "epoch": 0.0944055944055944, "grad_norm": 2.455430030822754, "kl": 0.0014260835014283657, "learning_rate": 1.3500000000000002e-06, "loss": 0.0001, "reward": 1.0012282133102417, "reward_std": 1.7000349760055542, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.15122826397418976, "rewards/reward_search_strategy": 0.10000000149011612, "step": 27 }, { "completion_length": 185.75, "epoch": 0.0979020979020979, "grad_norm": 1.199497938156128, "kl": 0.0007826727814972401, "learning_rate": 1.4000000000000001e-06, "loss": 0.0, "reward": 2.173332691192627, "reward_std": 2.801347255706787, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.32333269715309143, "rewards/reward_search_strategy": 0.10000000149011612, "step": 28 }, { "completion_length": 243.125, "epoch": 0.10139860139860139, "grad_norm": 0.8430306315422058, "kl": 0.0008962135761976242, "learning_rate": 1.45e-06, "loss": 0.0, "reward": 1.0625323057174683, "reward_std": 1.898645043373108, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.18753226101398468, "rewards/reward_search_strategy": 0.0, "step": 29 }, { "completion_length": 334.875, "epoch": 0.1048951048951049, "grad_norm": 0.8412113785743713, "kl": 0.001170428702607751, "learning_rate": 1.5e-06, "loss": 0.0, "reward": 1.303612232208252, "reward_std": 2.1826775074005127, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.27861225605010986, "rewards/reward_search_strategy": 0.02500000037252903, "step": 30 }, { "completion_length": 165.75, "epoch": 0.10839160839160839, "grad_norm": 1.9105679988861084, "kl": 0.0031302073039114475, "learning_rate": 1.5500000000000002e-06, "loss": 0.0001, "reward": 1.0198408365249634, "reward_std": 2.1316380500793457, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.16984085738658905, "rewards/reward_search_strategy": 0.10000000149011612, "step": 31 }, { "completion_length": 174.625, "epoch": 0.11188811188811189, "grad_norm": 2.309382915496826, "kl": 0.0032356895972043276, "learning_rate": 1.6000000000000001e-06, "loss": 0.0001, "reward": 0.9216341376304626, "reward_std": 1.8671420812606812, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.25, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.14663413166999817, "rewards/reward_search_strategy": 0.02500000037252903, "step": 32 }, { "completion_length": 85.75, "epoch": 0.11538461538461539, "grad_norm": 1.4122040271759033, "kl": 0.0017049856251105666, "learning_rate": 1.6500000000000003e-06, "loss": 0.0001, "reward": 1.524999976158142, "reward_std": 2.2926902770996094, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.10000000149011612, "rewards/reward_search_strategy": 0.05000000074505806, "step": 33 }, { "completion_length": 173.5, "epoch": 0.11888111888111888, "grad_norm": 0.9712541699409485, "kl": 0.0014842856908217072, "learning_rate": 1.7000000000000002e-06, "loss": 0.0001, "reward": 3.4844837188720703, "reward_std": 3.1145381927490234, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.3094840347766876, "rewards/reward_search_strategy": 0.17499999701976776, "step": 34 }, { "completion_length": 268.25, "epoch": 0.12237762237762238, "grad_norm": 1.1684075593948364, "kl": 0.0012478481512516737, "learning_rate": 1.75e-06, "loss": 0.0, "reward": 2.8716678619384766, "reward_std": 2.387646198272705, "rewards/reward_correctness": 0.5, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.3716679811477661, "rewards/reward_search_strategy": 0.125, "step": 35 }, { "completion_length": 235.125, "epoch": 0.1258741258741259, "grad_norm": 1.2749513387680054, "kl": 0.004472827073186636, "learning_rate": 1.8000000000000001e-06, "loss": 0.0002, "reward": 1.8775699138641357, "reward_std": 2.413510322570801, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.30256983637809753, "rewards/reward_search_strategy": 0.07500000298023224, "step": 36 }, { "completion_length": 175.5, "epoch": 0.12937062937062938, "grad_norm": 0.8929537534713745, "kl": 0.0037049425300210714, "learning_rate": 1.85e-06, "loss": 0.0001, "reward": 0.6821428537368774, "reward_std": 1.4377206563949585, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.18214285373687744, "rewards/reward_search_strategy": 0.0, "step": 37 }, { "completion_length": 290.125, "epoch": 0.13286713286713286, "grad_norm": 2.52665114402771, "kl": 0.005157872103154659, "learning_rate": 1.9000000000000002e-06, "loss": 0.0002, "reward": 2.574741840362549, "reward_std": 2.5146002769470215, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.14974190294742584, "rewards/reward_search_strategy": 0.17500001192092896, "step": 38 }, { "completion_length": 344.125, "epoch": 0.13636363636363635, "grad_norm": 0.8259175419807434, "kl": 0.002582128159701824, "learning_rate": 1.9500000000000004e-06, "loss": 0.0001, "reward": 2.376612901687622, "reward_std": 2.9910624027252197, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.4266127943992615, "rewards/reward_search_strategy": 0.07500000298023224, "step": 39 }, { "completion_length": 243.125, "epoch": 0.13986013986013987, "grad_norm": 1.0021271705627441, "kl": 0.004831339232623577, "learning_rate": 2.0000000000000003e-06, "loss": 0.0002, "reward": 2.04349946975708, "reward_std": 2.3903656005859375, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.34349945187568665, "rewards/reward_search_strategy": 0.07500000298023224, "step": 40 }, { "completion_length": 207.375, "epoch": 0.14335664335664336, "grad_norm": 1.2446260452270508, "kl": 0.005710378754884005, "learning_rate": 2.05e-06, "loss": 0.0002, "reward": 2.8324475288391113, "reward_std": 3.167931079864502, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.25744765996932983, "rewards/reward_search_strategy": 0.20000000298023224, "step": 41 }, { "completion_length": 441.125, "epoch": 0.14685314685314685, "grad_norm": 0.8263920545578003, "kl": 0.0030913222581148148, "learning_rate": 2.1000000000000002e-06, "loss": 0.0001, "reward": 2.5794529914855957, "reward_std": 2.6344144344329834, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.30445292592048645, "rewards/reward_search_strategy": 0.15000000596046448, "step": 42 }, { "completion_length": 226.375, "epoch": 0.15034965034965034, "grad_norm": 1.141964316368103, "kl": 0.008909309282898903, "learning_rate": 2.15e-06, "loss": 0.0004, "reward": 1.8603672981262207, "reward_std": 2.203197479248047, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.21036729216575623, "rewards/reward_search_strategy": 0.02500000037252903, "step": 43 }, { "completion_length": 397.75, "epoch": 0.15384615384615385, "grad_norm": 0.9564006328582764, "kl": 0.01092858798801899, "learning_rate": 2.2e-06, "loss": 0.0004, "reward": 2.557056188583374, "reward_std": 2.429002046585083, "rewards/reward_correctness": 0.375, "rewards/reward_em_chunk": 0.75, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.28205621242523193, "rewards/reward_search_strategy": 0.02500000037252903, "step": 44 }, { "completion_length": 307.25, "epoch": 0.15734265734265734, "grad_norm": 1.0510841608047485, "kl": 0.007918575778603554, "learning_rate": 2.25e-06, "loss": 0.0003, "reward": 1.2331278324127197, "reward_std": 1.7548075914382935, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.0, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.15812794864177704, "rewards/reward_search_strategy": 0.07500000298023224, "step": 45 }, { "completion_length": 277.0, "epoch": 0.16083916083916083, "grad_norm": 0.9169591665267944, "kl": 0.015062836930155754, "learning_rate": 2.3000000000000004e-06, "loss": 0.0006, "reward": 1.2089338302612305, "reward_std": 2.5115318298339844, "rewards/reward_correctness": 0.125, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.125, "rewards/reward_search_diversity": 0.1339338719844818, "rewards/reward_search_strategy": 0.20000000298023224, "step": 46 }, { "completion_length": 206.25, "epoch": 0.16433566433566432, "grad_norm": 1.3427180051803589, "kl": 0.019624339416623116, "learning_rate": 2.35e-06, "loss": 0.0008, "reward": 4.1133527755737305, "reward_std": 3.2583820819854736, "rewards/reward_correctness": 0.625, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.25, "rewards/reward_search_diversity": 0.36335283517837524, "rewards/reward_search_strategy": 0.125, "step": 47 }, { "completion_length": 252.125, "epoch": 0.16783216783216784, "grad_norm": 0.7444542050361633, "kl": 0.021463895216584206, "learning_rate": 2.4000000000000003e-06, "loss": 0.0009, "reward": 1.631263017654419, "reward_std": 2.5925517082214355, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.375, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.23126289248466492, "rewards/reward_search_strategy": 0.02500000037252903, "step": 48 }, { "completion_length": 117.625, "epoch": 0.17132867132867133, "grad_norm": 1.888673186302185, "kl": 0.03609447553753853, "learning_rate": 2.4500000000000003e-06, "loss": 0.0014, "reward": 1.9036250114440918, "reward_std": 2.2273151874542236, "rewards/reward_correctness": 0.25, "rewards/reward_em_chunk": 0.625, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.20362500846385956, "rewards/reward_search_strategy": 0.07500000298023224, "step": 49 }, { "completion_length": 302.75, "epoch": 0.17482517482517482, "grad_norm": 1.2723031044006348, "kl": 0.2012804001569748, "learning_rate": 2.5e-06, "loss": 0.0081, "reward": 0.15956448018550873, "reward_std": 0.4513165056705475, "rewards/reward_correctness": 0.0, "rewards/reward_em_chunk": 0.125, "rewards/reward_format": 0.0, "rewards/reward_search_diversity": 0.03456447646021843, "rewards/reward_search_strategy": 0.0, "step": 50 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }